318 files changed, 40173 insertions, 13988 deletions
diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
index 4fef40a86..34408c017 100644
--- a/unsupported/CMakeLists.txt
+++ b/unsupported/CMakeLists.txt
@@ -1,7 +1,11 @@
 add_subdirectory(Eigen)
-add_subdirectory(doc EXCLUDE_FROM_ALL)
-if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
-  add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
-else()
-  add_subdirectory(test EXCLUDE_FROM_ALL)
+if(EIGEN_BUILD_DOC)
+  add_subdirectory(doc EXCLUDE_FROM_ALL)
+endif()
+if(BUILD_TESTING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()
 endif()
diff --git a/unsupported/Eigen/AdolcForward b/unsupported/Eigen/AdolcForward
index 15f5f0731..56caeaebf 100644
--- a/unsupported/Eigen/AdolcForward
+++ b/unsupported/Eigen/AdolcForward
@@ -40,7 +40,7 @@
 # undef realloc
 #endif
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
@@ -74,6 +74,9 @@ inline adouble imag(const adouble&)    { return 0.; }
 inline adouble abs(const adouble&  x)  { return fabs(x); }
 inline adouble abs2(const adouble& x)  { return x*x; }
 
+inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
+inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
+
 }
 
 namespace Eigen {
diff --git a/unsupported/Eigen/AlignedVector3 b/unsupported/Eigen/AlignedVector3
index 47a86d4c0..4fa1842ac 100644
--- a/unsupported/Eigen/AlignedVector3
+++ b/unsupported/Eigen/AlignedVector3
@@ -10,7 +10,9 @@
 #ifndef EIGEN_ALIGNED_VECTOR3
 #define EIGEN_ALIGNED_VECTOR3
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
@@ -76,6 +78,9 @@ template<typename _Scalar> class AlignedVector3
     { return m_coeffs.coeffRef(index);}
 
 
+    inline AlignedVector3()
+    {}
+
     inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
       : m_coeffs(x, y, z, Scalar(0))
     {}
@@ -129,6 +134,9 @@ template<typename _Scalar> class AlignedVector3
     inline AlignedVector3 operator-(const AlignedVector3& other) const
     { return AlignedVector3(m_coeffs - other.m_coeffs); }
 
+    inline AlignedVector3 operator-() const
+    { return AlignedVector3(-m_coeffs); }
+
     inline AlignedVector3 operator-=(const AlignedVector3& other)
     { m_coeffs -= other.m_coeffs; return *this; }
 
@@ -221,4 +229,6 @@ struct evaluator<AlignedVector3<Scalar> >
 
 }
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_ALIGNED_VECTOR3
diff --git a/unsupported/Eigen/ArpackSupport b/unsupported/Eigen/ArpackSupport
index 37a2799ef..67c4ac838 100644
--- a/unsupported/Eigen/ArpackSupport
+++ b/unsupported/Eigen/ArpackSupport
@@ -9,9 +9,7 @@
 #ifndef EIGEN_ARPACKSUPPORT_MODULE_H
 #define EIGEN_ARPACKSUPPORT_MODULE_H
 
-#include <Eigen/Core>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/Core"
 
 /** \defgroup ArpackSupport_Module Arpack support module
   *
@@ -22,10 +20,11 @@
   * \endcode
   */
 
-#include <Eigen/SparseCholesky>
+#include "../../Eigen/SparseCholesky"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_ARPACKSUPPORT_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index abf5b7d67..7a4ff460c 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -28,11 +28,17 @@ namespace Eigen {
 //@{
 
 }
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 
 #include "src/AutoDiff/AutoDiffScalar.h"
 // #include "src/AutoDiff/AutoDiffVector.h"
 #include "src/AutoDiff/AutoDiffJacobian.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+
 namespace Eigen {
 //@}
 }
diff --git a/unsupported/Eigen/BVH b/unsupported/Eigen/BVH
index 0161a5402..666c9835f 100644
--- a/unsupported/Eigen/BVH
+++ b/unsupported/Eigen/BVH
@@ -10,9 +10,9 @@
 #ifndef EIGEN_BVH_MODULE_H
 #define EIGEN_BVH_MODULE_H
 
-#include <Eigen/Core>
-#include <Eigen/Geometry>
-#include <Eigen/StdVector>
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+#include "../../Eigen/StdVector"
 #include <algorithm>
 #include <queue>
 
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 7ecb4c74d..0938bb554 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -13,21 +13,11 @@
 
 #include "../../../Eigen/Core"
 
-#ifdef EIGEN_USE_SYCL
-#undef min
-#undef max
-#undef isnan
-#undef isinf
-#undef isfinite
-#include <SYCL/sycl.hpp>
-#include <map>
-#include <memory>
-#include <utility>
-#endif
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#if EIGEN_HAS_CXX11
 
 #include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
@@ -39,46 +29,29 @@
   * \code
   * #include <Eigen/CXX11/Tensor>
   * \endcode
+  *
+  * Much of the documentation can be found \ref eigen_tensors "here".
   */
 
+#include <atomic>
+#include <chrono>
 #include <cmath>
 #include <cstddef>
 #include <cstring>
-
-#ifdef _WIN32
-typedef __int16 int16_t;
-typedef unsigned __int16 uint16_t;
-typedef __int32 int32_t;
-typedef unsigned __int32 uint32_t;
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-#else
-#include <stdint.h>
-#endif
-
-#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
 #include <random>
-#endif
-
-#ifdef _WIN32
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
+#include <thread>
 
-#ifdef EIGEN_USE_THREADS
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
 #include "ThreadPool"
 #endif
 
 #ifdef EIGEN_USE_GPU
-#include <iostream>
-#include <cuda_runtime.h>
-#if __cplusplus >= 201103L
-#include <atomic>
-#include <unistd.h>
-#endif
+  #include <iostream>
+  #if defined(EIGEN_USE_HIP)
+    #include <hip/hip_runtime.h>
+  #else
+    #include <cuda_runtime.h>
+  #endif
 #endif
 
 #include "src/Tensor/TensorMacros.h"
@@ -88,7 +61,10 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceCuda.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
 #include "src/Tensor/TensorDeviceSycl.h"
 #include "src/Tensor/TensorIndexList.h"
 #include "src/Tensor/TensorDimensionList.h"
@@ -101,18 +77,19 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
 
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionCuda.h"
+#include "src/Tensor/TensorReductionGpu.h"
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionCuda.h"
+#include "src/Tensor/TensorContractionGpu.h"
 #include "src/Tensor/TensorConversion.h"
 #include "src/Tensor/TensorConvolution.h"
 #include "src/Tensor/TensorFFT.h"
@@ -134,8 +111,15 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGenerator.h"
 #include "src/Tensor/TensorAssign.h"
 #include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
 
-#include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorExecutor.h"
 #include "src/Tensor/TensorDevice.h"
 
@@ -147,6 +131,7 @@ typedef unsigned __int64 uint64_t;
 
 #include "src/Tensor/TensorIO.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
+#endif  // EIGEN_HAS_CXX11
 //#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/unsupported/Eigen/CXX11/TensorSymmetry b/unsupported/Eigen/CXX11/TensorSymmetry
index fb1b0c0fb..b09c5e472 100644
--- a/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/unsupported/Eigen/CXX11/TensorSymmetry
@@ -10,9 +10,9 @@
 #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
 #define EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
-#include <unsupported/Eigen/CXX11/Tensor>
+#include "Tensor"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #include "src/util/CXX11Meta.h"
 
@@ -33,7 +33,7 @@
 #include "src/TensorSymmetry/StaticSymmetry.h"
 #include "src/TensorSymmetry/DynamicSymmetry.h"
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
diff --git a/unsupported/Eigen/CXX11/ThreadPool b/unsupported/Eigen/CXX11/ThreadPool
index 09d637e9a..c5cafb2a1 100644
--- a/unsupported/Eigen/CXX11/ThreadPool
+++ b/unsupported/Eigen/CXX11/ThreadPool
@@ -12,7 +12,7 @@
 
 #include "../../../Eigen/Core"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
   *
@@ -30,10 +30,9 @@
 
 // The code depends on CXX11, so only include the module if the
 // compiler supports it.
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+#if (EIGEN_COMP_CXXVER >= 11)
 #include <cstddef>
 #include <cstring>
-#include <stdint.h>
 #include <time.h>
 
 #include <vector>
@@ -44,22 +43,32 @@
 #include <thread>
 #include <functional>
 #include <memory>
+#include <utility>
+
+// There are non-parenthesized calls to "max" in the  <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
+#endif
+#include <unordered_map>
 
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
 #include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"
 #include "src/ThreadPool/ThreadEnvironment.h"
-#include "src/ThreadPool/SimpleThreadPool.h"
+#include "src/ThreadPool/Barrier.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
 
 #endif
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CXX11_THREADPOOL_MODULE
-
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 98e83811b..2f65b1b0e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1,14 +1,12 @@
-# Eigen Tensors
+# Eigen Tensors {#eigen_tensors}
 
 Tensors are multidimensional arrays of elements. Elements are typically scalars,
 but more complex types such as strings are also supported.
 
-[TOC]
-
 ## Tensor Classes
 
 You can manipulate a tensor with one of the following classes.  They all are in
-the namespace ```::Eigen.```
+the namespace `::Eigen.`
 
 
 ### Class Tensor<data_type, rank>
@@ -23,8 +21,8 @@ different size to a Tensor, that tensor is resized to match its new value.
 
 #### Constructor Tensor<data_type, rank>(size0, size1, ...)
 
-Constructor for a Tensor.  The constructor must be passed ```rank``` integers
-indicating the sizes of the instance along each of the the ```rank```
+Constructor for a Tensor.  The constructor must be passed `rank` integers
+indicating the sizes of the instance along each of the the `rank`
 dimensions.
 
     // Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
@@ -38,7 +36,7 @@ dimensions.
 
 Constructor where the sizes for the constructor are specified as an array of
 values instead of an explicitly list of parameters.  The array type to use is
-```Eigen::array<Eigen::Index>```.  The array can be constructed automatically
+`Eigen::array<Eigen::Index>`.  The array can be constructed automatically
 from an initializer list.
 
     // Create a tensor of strings of rank 2 with sizes 5, 7.
@@ -83,8 +81,8 @@ large enough to hold all the data.
 
     // You can also map fixed-size tensors.  Here we get a 1d view of
     // the 2d fixed-size tensor.
-    Tensor<float, Sizes<4, 5>> t_4x3;
-    TensorMap<Tensor<float, 1>> t_12(t_4x3, 12);
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
 
 
 #### Class TensorRef
@@ -95,8 +93,8 @@ See Assigning to a TensorRef below.
 
 #### <data_type> tensor(index0, index1...)
 
-Return the element at position ```(index0, index1...)``` in tensor
-```tensor```.  You must pass as many parameters as the rank of ```tensor```.
+Return the element at position `(index0, index1...)` in tensor
+`tensor`.  You must pass as many parameters as the rank of `tensor`.
 The expression can be used as an l-value to set the value of the element at the
 specified position.  The value returned is of the datatype of the tensor.
 
@@ -121,8 +119,8 @@ specified position.  The value returned is of the datatype of the tensor.
 
 ## TensorLayout
 
-The tensor library supports 2 layouts: ```ColMajor``` (the default) and
-```RowMajor```.  Only the default column major layout is currently fully
+The tensor library supports 2 layouts: `ColMajor` (the default) and
+`RowMajor`.  Only the default column major layout is currently fully
 supported, and it is therefore not recommended to attempt to use the row major
 layout at the moment.
 
@@ -136,7 +134,7 @@ All the arguments to an expression must use the same layout. Attempting to mix
 different layouts will result in a compilation error.
 
 It is possible to change the layout of a tensor or an expression using the
-```swap_layout()``` method.  Note that this will also reverse the order of the
+`swap_layout()` method.  Note that this will also reverse the order of the
 dimensions.
 
     Tensor<float, 2, ColMajor> col_major(2, 4);
@@ -173,35 +171,35 @@ the following code computes the elementwise addition of two tensors:
     Tensor<float, 3> t3 = t1 + t2;
 
 While the code above looks easy enough, it is important to understand that the
-expression ```t1 + t2``` is not actually adding the values of the tensors.  The
+expression `t1 + t2` is not actually adding the values of the tensors.  The
 expression instead constructs a "tensor operator" object of the class
 TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors
-```t1``` and ```t2```.  This is a small C++ object that knows how to add
-```t1``` and ```t2```.  It is only when the value of the expression is assigned
-to the tensor ```t3``` that the addition is actually performed.  Technically,
-this happens through the overloading of ```operator=()``` in the Tensor class.
+`t1` and `t2`.  This is a small C++ object that knows how to add
+`t1` and `t2`.  It is only when the value of the expression is assigned
+to the tensor `t3` that the addition is actually performed.  Technically,
+this happens through the overloading of `operator=()` in the Tensor class.
 
 This mechanism for computing tensor expressions allows for lazy evaluation and
 optimizations which are what make the tensor library very fast.
 
-Of course, the tensor operators do nest, and the expression ```t1 + t2 *
-0.3f``` is actually represented with the (approximate) tree of operators:
+Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f`
+is actually represented with the (approximate) tree of operators:
 
     TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
 
 
 ### Tensor Operations and C++ "auto"
 
-Because Tensor operations create tensor operators, the C++ ```auto``` keyword
+Because Tensor operations create tensor operators, the C++ `auto` keyword
 does not have its intuitive meaning.  Consider these 2 lines of code:
 
     Tensor<float, 3> t3 = t1 + t2;
     auto t4 = t1 + t2;
 
-In the first line we allocate the tensor ```t3``` and it will contain the
-result of the addition of ```t1``` and ```t2```.  In the second line, ```t4```
+In the first line we allocate the tensor `t3` and it will contain the
+result of the addition of `t1` and `t2`.  In the second line, `t4`
 is actually the tree of tensor operators that will compute the addition of
-```t1``` and ```t2```.  In fact, ```t4``` is *not* a tensor and you cannot get
+`t1` and `t2`.  In fact, `t4` is *not* a tensor and you cannot get
 the values of its elements:
 
     Tensor<float, 3> t3 = t1 + t2;
@@ -210,8 +208,8 @@ the values of its elements:
     auto t4 = t1 + t2;
     cout << t4(0, 0, 0);  // Compilation error!
 
-When you use ```auto``` you do not get a Tensor as a result but instead a
-non-evaluated expression.  So only use ```auto``` to delay evaluation.
+When you use `auto` you do not get a Tensor as a result but instead a
+non-evaluated expression.  So only use `auto` to delay evaluation.
 
 Unfortunately, there is no single underlying concrete type for holding
 non-evaluated expressions, hence you have to use auto in the case when you do
@@ -257,9 +255,9 @@ There are several ways to control when expressions are evaluated:
 #### Assigning to a Tensor, TensorFixedSize, or TensorMap.
 
 The most common way to evaluate an expression is to assign it to a Tensor.  In
-the example below, the ```auto``` declarations make the intermediate values
+the example below, the `auto` declarations make the intermediate values
 "Operations", not Tensors, and do not cause the expressions to be evaluated.
-The assignment to the Tensor ```result``` causes the evaluation of all the
+The assignment to the Tensor `result` causes the evaluation of all the
 operations.
 
     auto t3 = t1 + t2;             // t3 is an Operation.
@@ -272,7 +270,7 @@ Operation to a TensorFixedSize instead of a Tensor, which is a bit more
 efficient.
 
     // We know that the result is a 4x4x2 tensor!
-    TensorFixedSize<float, 4, 4, 2> result = t5;
+    TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
 
 Simiarly, assigning an expression to a TensorMap causes its evaluation.  Like
 tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
@@ -282,7 +280,7 @@ have the rank and sizes of the expression that are assigned to them.
 
 When you compute large composite expressions, you sometimes want to tell Eigen
 that an intermediate value in the expression tree is worth evaluating ahead of
-time.  This is done by inserting a call to the ```eval()``` method of the
+time.  This is done by inserting a call to the `eval()` method of the
 expression Operation.
 
     // The previous example could have been written:
@@ -291,15 +289,15 @@ expression Operation.
     // If you want to compute (t1 + t2) once ahead of time you can write:
     Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
 
-Semantically, calling ```eval()``` is equivalent to materializing the value of
+Semantically, calling `eval()` is equivalent to materializing the value of
 the expression in a temporary Tensor of the right size.  The code above in
 effect does:
 
     // .eval() knows the size!
-    TensorFixedSize<float, 4, 4, 2> tmp = t1 + t2;
+    TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
     Tensor<float, 3> result = (tmp * 0.2f).exp();
 
-Note that the return value of ```eval()``` is itself an Operation, so the
+Note that the return value of `eval()` is itself an Operation, so the
 following code does not do what you may think:
 
     // Here t3 is an evaluation Operation.  t3 has not been evaluated yet.
@@ -312,24 +310,24 @@ following code does not do what you may think:
     // an intermediate tensor to represent t3.x
     Tensor<float, 3> result = t4;
 
-While in the examples above calling ```eval()``` does not make a difference in
+While in the examples above calling `eval()` does not make a difference in
 performance, in other cases it can make a huge difference.  In the expression
-below the ```broadcast()``` expression causes the ```X.maximum()``` expression
+below the `broadcast()` expression causes the `X.maximum()` expression
 to be evaluated many times:
 
     Tensor<...> X ...;
     Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
                      * beta).exp();
 
-Inserting a call to ```eval()``` between the ```maximum()``` and
-```reshape()``` calls guarantees that maximum() is only computed once and
+Inserting a call to `eval()` between the `maximum()` and
+`reshape()` calls guarantees that maximum() is only computed once and
 greatly speeds-up execution:
 
     Tensor<...> Y =
       ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
         * beta).exp();
 
-In the other example below, the tensor ```Y``` is both used in the expression
+In the other example below, the tensor `Y` is both used in the expression
 and its assignment.  This is an aliasing problem and if the evaluation is not
 done in the right order Y will be updated incrementally during the evaluation
 resulting in bogus results:
@@ -337,8 +335,8 @@ resulting in bogus results:
      Tensor<...> Y ...;
      Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
 
-Inserting a call to ```eval()``` between the ```sum()``` and ```reshape()```
-expressions ensures that the sum is computed before any updates to ```Y``` are
+Inserting a call to `eval()` between the `sum()` and `reshape()`
+expressions ensures that the sum is computed before any updates to `Y` are
 done.
 
      Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
@@ -347,8 +345,8 @@ Note that an eval around the full right hand side expression is not needed
 because the generated has to compute the i-th value of the right hand side
 before assigning it to the left hand side.
 
-However, if you were assigning the expression value to a shuffle of ```Y```
-then you would need to force an eval for correctness by adding an ```eval()```
+However, if you were assigning the expression value to a shuffle of `Y`
+then you would need to force an eval for correctness by adding an `eval()`
 call for the right hand side:
 
      Y.shuffle(...) =
@@ -361,7 +359,7 @@ If you need to access only a few elements from the value of an expression you
 can avoid materializing the value in a full tensor by using a TensorRef.
 
 A TensorRef is a small wrapper class for any Eigen Operation.  It provides
-overloads for the ```()``` operator that let you access individual values in
+overloads for the `()` operator that let you access individual values in
 the expression.  TensorRef is convenient, because the Operation themselves do
 not provide a way to access individual elements.
 
@@ -390,7 +388,7 @@ such as contractions and convolutions.  The implementations are optimized for
 different environments: single threaded on CPU, multi threaded on CPU, or on a
 GPU using cuda.  Additional implementations may be added later.
 
-You can choose which implementation to use with the ```device()``` call.  If
+You can choose which implementation to use with the `device()` call.  If
 you do not choose an implementation explicitly the default implementation that
 uses a single thread on the CPU is used.
 
@@ -406,7 +404,7 @@ single-threaded CPU implementation:
     Tensor<float, 2> b(30, 40);
     Tensor<float, 2> c = a + b;
 
-To choose a different implementation you have to insert a ```device()``` call
+To choose a different implementation you have to insert a `device()` call
 before the assignment of the result.  For technical C++ reasons this requires
 that the Tensor for the result be declared on its own.  This means that you
 have to know the size of the result.
@@ -414,24 +412,27 @@ have to know the size of the result.
     Eigen::Tensor<float, 2> c(30, 40);
     c.device(...) = a + b;
 
-The call to ```device()``` must be the last call on the left of the operator=.
+The call to `device()` must be the last call on the left of the operator=.
 
-You must pass to the ```device()``` call an Eigen device object.  There are
+You must pass to the `device()` call an Eigen device object.  There are
 presently three devices you can use: DefaultDevice, ThreadPoolDevice and
 GpuDevice.
 
 
 #### Evaluating With the DefaultDevice
 
-This is exactly the same as not inserting a ```device()``` call.
+This is exactly the same as not inserting a `device()` call.
 
     DefaultDevice my_device;
     c.device(my_device) = a + b;
 
 #### Evaluating with a Thread Pool
 
+    // Create the Eigen ThreadPool
+    Eigen::ThreadPool pool(8 /* number of threads in pool */)
+
     // Create the Eigen ThreadPoolDevice.
-    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
+    Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
 
     // Now just use the device when evaluating expressions.
     Eigen::Tensor<float, 2> c(30, 50);
@@ -454,20 +455,20 @@ that are tensor-type specific:
 
 #### <Tensor-Type>::Dimensions
 
-Acts like an array of ints.  Has an ```int size``` attribute, and can be
+Acts like an array of ints.  Has an `int size` attribute, and can be
 indexed like an array to access individual values.  Used to represent the
-dimensions of a tensor.  See ```dimensions()```.
+dimensions of a tensor.  See `dimensions()`.
 
 #### <Tensor-Type>::Index
 
-Acts like an ```int```.  Used for indexing tensors along their dimensions.  See
-```operator()```, ```dimension()```, and ```size()```.
+Acts like an `int`.  Used for indexing tensors along their dimensions.  See
+`operator()`, `dimension()`, and `size()`.
 
 #### <Tensor-Type>::Scalar
 
 Represents the datatype of individual tensor elements.  For example, for a
-```Tensor<float>```, ```Scalar``` is the type ```float```.  See
-```setConstant()```.
+`Tensor<float>`, `Scalar` is the type `float`.  See
+`setConstant()`.
 
 #### <Operation>
 
@@ -501,7 +502,7 @@ known as the tensor "rank".
 ### Dimensions dimensions()
 
 Returns an array-like object representing the dimensions of the tensor.
-The actual type of the dimensions() result is <Tensor-Type>::Dimensions.
+The actual type of the `dimensions()` result is `<Tensor-Type>::``Dimensions`.
 
     Eigen::Tensor<float, 2> a(3, 4);
     const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
@@ -509,7 +510,7 @@ The actual type of the dimensions() result is <Tensor-Type>::Dimensions.
          << ", dim 1: " << d[1];
     => Dim size: 2, dim 0: 3, dim 1: 4
 
-If you use a C++11 compiler, you can use ```auto``` to simplify the code:
+If you use a C++11 compiler, you can use `auto` to simplify the code:
 
     const auto& d = a.dimensions();
     cout << "Dim size: " << d.size << ", dim 0: " << d[0]
@@ -519,7 +520,7 @@ If you use a C++11 compiler, you can use ```auto``` to simplify the code:
 ### Index dimension(Index n)
 
 Returns the n-th dimension of the tensor.  The actual type of the
-```dimension()``` result is ```<Tensor-Type>::Index```, but you can
+`dimension()` result is `<Tensor-Type>::``Index`, but you can
 always use it like an int.
 
       Eigen::Tensor<float, 2> a(3, 4);
@@ -530,8 +531,8 @@ always use it like an int.
 ### Index size()
 
 Returns the total number of elements in the tensor.  This is the product of all
-the tensor dimensions.  The actual type of the ```size()``` result is
-```<Tensor-Type>::Index```, but you can always use it like an int.
+the tensor dimensions.  The actual type of the `size()` result is
+`<Tensor-Type>::``Index`, but you can always use it like an int.
 
     Eigen::Tensor<float, 2> a(3, 4);
     cout << "Size: " << a.size();
@@ -540,11 +541,11 @@ the tensor dimensions.  The actual type of the ```size()``` result is
 
 ### Getting Dimensions From An Operation
 
-A few operations provide ```dimensions()``` directly,
-e.g. ```TensorReslicingOp```.  Most operations defer calculating dimensions
+A few operations provide `dimensions()` directly,
+e.g. `TensorReslicingOp`.  Most operations defer calculating dimensions
 until the operation is being evaluated.  If you need access to the dimensions
 of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
-TensorRef above), which provides ```dimensions()``` and ```dimension()``` as
+TensorRef above), which provides `dimensions()` and `dimension()` as
 above.
 
 TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
@@ -567,11 +568,11 @@ to the rank of the tensor. The content of the tensor is not initialized.
 
 ### TensorFixedSize
 
-Creates a tensor of the specified size. The number of arguments in the Size<>
+Creates a tensor of the specified size. The number of arguments in the Sizes<>
 template parameter determines the rank of the tensor. The content of the tensor
 is not initialized.
 
-    Eigen::TensorFixedSize<float, Size<3, 4>> a;
+    Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
     cout << "Rank: " << a.rank() << endl;
     => Rank: 2
     cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
@@ -581,14 +582,14 @@ is not initialized.
 
 Creates a tensor mapping an existing array of data. The data must not be freed
 until the TensorMap is discarded, and the size of the data must be large enough
-to accomodate of the coefficients of the tensor.
+to accommodate the coefficients of the tensor.
 
     float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-    Eigen::TensorMap<float, 2> a(data, 3, 4);
+    Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
     cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
     => NumRows: 3 NumCols: 4
     cout << "a(1, 2): " << a(1, 2) << endl;
-    => a(1, 2): 9
+    => a(1, 2): 7
 
 
 ## Contents Initialization
@@ -604,7 +605,7 @@ result.  These are not tensor Operations which delay evaluation.
 
 ### <Tensor-Type> setConstant(const Scalar& val)
 
-Sets all elements of the tensor to the constant value ```val```.  ```Scalar```
+Sets all elements of the tensor to the constant value `val`.  `Scalar`
 is the type of data stored in the tensor.  You can pass any value that is
 convertible to that type.
 
@@ -618,8 +619,8 @@ Returns the tensor itself in case you want to chain another call.
     12.3 12.3 12.3 12.3
     12.3 12.3 12.3 12.3
 
-Note that ```setConstant()``` can be used on any tensor where the element type
-has a copy constructor and an ```operator=()```:
+Note that `setConstant()` can be used on any tensor where the element type
+has a copy constructor and an `operator=()`:
 
     Eigen::Tensor<string, 2> a(2, 3);
     a.setConstant("yolo");
@@ -632,7 +633,7 @@ has a copy constructor and an ```operator=()```:
 
 ### <Tensor-Type> setZero()
 
-Fills the tensor with zeros.  Equivalent to ```setConstant(Scalar(0))```.
+Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
 Returns the tensor itself in case you want to chain another call.
 
     a.setZero();
@@ -653,10 +654,10 @@ If the tensor has rank N, the initializer list must be nested N times.  The
 most deeply nested lists must contains P scalars of the Tensor type where P is
 the size of the last dimension of the Tensor.
 
-For example, for a ```TensorFixedSize<float, 2, 3>``` the initializer list must
+For example, for a `TensorFixedSize<float, 2, 3>` the initializer list must
 contains 2 lists of 3 floats each.
 
-```setValues()``` returns the tensor itself in case you want to chain another
+`setValues()` returns the tensor itself in case you want to chain another
 call.
 
     Eigen::Tensor<float, 2> a(2, 3);
@@ -693,16 +694,16 @@ want to chain another call.
      -0.211234   0.823295   0.536459 -0.0452059
       0.566198  -0.604897  -0.444451   0.257742
 
-You can customize ```setRandom()``` by providing your own random number
+You can customize `setRandom()` by providing your own random number
 generator as a template argument:
 
     a.setRandom<MyRandomGenerator>();
 
-Here, ```MyRandomGenerator``` must be a struct with the following member
-functions, where Scalar and Index are the same as ```<Tensor-Type>::Scalar```
-and ```<Tensor-Type>::Index```.
+Here, `MyRandomGenerator` must be a struct with the following member
+functions, where Scalar and Index are the same as `<Tensor-Type>::``Scalar`
+and `<Tensor-Type>::``Index`.
 
-See ```struct UniformRandomGenerator``` in TensorFunctors.h for an example.
+See `struct UniformRandomGenerator` in TensorFunctors.h for an example.
 
     // Custom number generator for use with setRandom().
     struct MyRandomGenerator {
@@ -767,7 +768,7 @@ Scalar is the type of data stored in the tensor.
 
 ## Tensor Operations
 
-All the methods documented below return non evaluated tensor ```Operations```.
+All the methods documented below return non evaluated tensor `Operations`.
 These can be chained: you can apply another Tensor Operation to the value
 returned by the method.
 
@@ -778,7 +779,7 @@ their evaluation.
 ### <Operation> constant(const Scalar& val)
 
 Returns a tensor of the same type and dimensions as the original tensor but
-where all elements have the value ```val```.
+where all elements have the value `val`.
 
 This is useful, for example, when you want to add or subtract a constant from a
 tensor, or multiply every element of a tensor by a scalar.
@@ -810,7 +811,7 @@ but where all elements have random values.
 
 This is for example useful to add random values to an existing tensor.
 The generation of random values can be customized in the same manner
-as for ```setRandom()```.
+as for `setRandom()`.
 
     Eigen::Tensor<float, 2> a(2, 3);
     a.setConstant(1.0f);
@@ -1013,16 +1014,23 @@ multidimensional case.
     Eigen::Tensor<int, 2> a(2, 3);
     a.setValues({{1, 2, 3}, {6, 5, 4}});
     Eigen::Tensor<int, 2> b(3, 2);
-    a.setValues({{1, 2}, {4, 5}, {5, 6}});
+    b.setValues({{1, 2}, {4, 5}, {5, 6}});
 
     // Compute the traditional matrix product
-    array<IndexPair<int>, 1> product_dims = { IndexPair(1, 0) };
+    Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
     Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
 
     // Compute the product of the transpose of the matrices
-    array<IndexPair<int>, 1> transpose_product_dims = { IndexPair(0, 1) };
+    Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
     Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
 
+    // Contraction to scalar value using a double contraction.
+    // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
+    Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+    Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
+
+    // Extracting the scalar value of the tensor contraction for further usage
+    int value = AdoubleContractedA(0);
 
 ## Reduction Operations
 
@@ -1032,13 +1040,13 @@ original tensor.  The values in the returned tensor are computed by applying a
 the dimensions along which the slices are made.
 
 The Eigen Tensor library provides a set of predefined reduction operators such
-as ```maximum()``` and ```sum()``` and lets you define additional operators by
+as `maximum()` and `sum()` and lets you define additional operators by
 implementing a few methods from a reductor template.
 
 ### Reduction Dimensions
 
 All reduction operations take a single parameter of type
-```<TensorType>::Dimensions``` which can always be specified as an array of
+`<TensorType>::``Dimensions` which can always be specified as an array of
 ints.  These are called the "reduction dimensions."  The values are the indices
 of the dimensions of the input tensor over which the reduction is done.  The
 parameter can have at most as many element as the rank of the input tensor;
@@ -1164,10 +1172,62 @@ short-circuiting, so may be significantly inefficient.
 
 ### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
 
-Reduce a tensor using a user-defined reduction operator.  See ```SumReducer```
+Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
 in TensorFunctors.h for information on how to implement a reduction operator.
 
 
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
+are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+    // Create a tensor of 3 dimensions
+    Eigen::Tensor<int, 3> a(2, 2, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+    // Specify the dimensions along which the trace will be computed.
+    // In this example, the trace can only be computed along the dimensions
+    // with indices 0 and 1
+    Eigen::array<int, 2> dims({0, 1});
+    // The output tensor contains all but the trace dimensions.
+    Tensor<int, 1> a_trace = a.trace(dims);
+    cout << "a_trace:" << endl;
+    cout << a_trace << endl;
+    =>
+    a_trace:
+    11
+    13
+    15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+    // Create a tensor of 3 dimensions, with all dimensions having the same size.
+    Eigen::Tensor<int, 3> a(3, 3, 3);
+    a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+                {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+                {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+    // Result is a zero dimension tensor
+    Tensor<int, 0> a_trace = a.trace();
+    cout<<"a_trace:"<<endl;
+    cout<<a_trace<<endl;
+    =>
+    a_trace:
+    42
+
+
 ## Scan Operations
 
 A *Scan* operation returns a tensor with the same dimensions as the original
@@ -1191,7 +1251,7 @@ dd a comment to this line
     =>
     a
     1 2 3
-    6 5 4
+    4 5 6
 
     b
     1  3  6
@@ -1273,7 +1333,7 @@ the number of elements in the input tensor.
 This operation does not move any data in the input tensor, so the resulting
 contents of a reshaped Tensor depend on the data layout of the original Tensor.
 
-For example this is what happens when you ```reshape()``` a 2D ColMajor tensor
+For example this is what happens when you `reshape()` a 2D ColMajor tensor
 to one dimension:
 
     Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
@@ -1314,7 +1374,7 @@ The previous example can be rewritten as follow:
     Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
     a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
     Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
-    Eigen::Tensor<float, 1, Eigen::ColMajor> b;
+    Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
     b.reshape(two_dim) = a;
     cout << "b" << endl << b << endl;
     =>
@@ -1378,7 +1438,7 @@ elements) along each of the dimensions.  The argument strides is an
 array of Index values.  The dimensions of the resulting tensor are
 ceil(input_dimensions[i] / strides[i]).
 
-For example this is what happens when you ```stride()``` a 2D tensor:
+For example this is what happens when you `stride()` a 2D tensor:
 
     Eigen::Tensor<int, 2> a(4, 3);
     a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
@@ -1482,7 +1542,7 @@ values that indicates whether or not the order of the coefficients should be
 reversed along each of the dimensions.  This operation preserves the dimensions
 of the input tensor.
 
-For example this is what happens when you ```reverse()``` the first dimension
+For example this is what happens when you `reverse()` the first dimension
 of a 2D tensor:
 
     Eigen::Tensor<int, 2> a(4, 3);
@@ -1568,85 +1628,83 @@ dimension in RowMajor layout.
 
 For example, given the following input tensor:
 
-  Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
-  tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
-                    {4.0f, 5.0f, 6.0f, 7.0f},
-                    {8.0f, 9.0f, 10.0f, 11.0f}});
+    Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+    tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                      {4.0f, 5.0f, 6.0f, 7.0f},
+                      {8.0f, 9.0f, 10.0f, 11.0f}});
 
-  cout << "tensor: " << endl << tensor << endl;
-=>
-tensor:
- 0   1   2   3
- 4   5   6   7
- 8   9  10  11
+    cout << "tensor: " << endl << tensor << endl;
+    =>
+    tensor:
+     0   1   2   3
+     4   5   6   7
+     8   9  10  11
 
 Six 2x2 patches can be extracted and indexed using the following code:
 
-  Eigen::Tensor<float, 3, DataLayout> patch;
-  Eigen::array<ptrdiff_t, 2> patch_dims;
-  patch_dims[0] = 2;
-  patch_dims[1] = 2;
-  patch = tensor.extract_patches(patch_dims);
-  for (int k = 0; k < 6; ++k) {
-    cout << "patch index: " << k << endl;
-    for (int i = 0; i < 2; ++i) {
-      for (int j = 0; j < 2; ++j) {
-        if (DataLayout == ColMajor) {
-          cout << patch(i, j, k) << " ";
-        } else {
-          cout << patch(k, i, j) << " ";
-        }
+    Eigen::Tensor<float, 3, DataLayout> patch;
+    Eigen::array<ptrdiff_t, 2> patch_dims;
+    patch_dims[0] = 2;
+    patch_dims[1] = 2;
+    patch = tensor.extract_patches(patch_dims);
+    for (int k = 0; k < 6; ++k) {
+      cout << "patch index: " << k << endl;
+      for (int i = 0; i < 2; ++i) {
+    	for (int j = 0; j < 2; ++j) {
+    	  if (DataLayout == ColMajor) {
+    		cout << patch(i, j, k) << " ";
+    	  } else {
+    		cout << patch(k, i, j) << " ";
+    	  }
+    	}
+    	cout << endl;
       }
-      cout << endl;
     }
-  }
 
 This code results in the following output when the data layout is ColMajor:
 
-patch index: 0
-0 1
-4 5
-patch index: 1
-4 5
-8 9
-patch index: 2
-1 2
-5 6
-patch index: 3
-5 6
-9 10
-patch index: 4
-2 3
-6 7
-patch index: 5
-6 7
-10 11
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    4 5
+    8 9
+    patch index: 2
+    1 2
+    5 6
+    patch index: 3
+    5 6
+    9 10
+    patch index: 4
+    2 3
+    6 7
+    patch index: 5
+    6 7
+    10 11
 
 This code results in the following output when the data layout is RowMajor:
 (NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
 
-patch index: 0
-0 1
-4 5
-patch index: 1
-1 2
-5 6
-patch index: 2
-2 3
-6 7
-patch index: 3
-4 5
-8 9
-patch index: 4
-5 6
-9 10
-patch index: 5
-6 7
-10 11
-
-### <Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols,
-                          const Index row_stride, const Index col_stride,
-                          const PaddingType padding_type)
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    1 2
+    5 6
+    patch index: 2
+    2 3
+    6 7
+    patch index: 3
+    4 5
+    8 9
+    patch index: 4
+    5 6
+    9 10
+    patch index: 5
+    6 7
+    10 11
+
+### <Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
 
 Returns a tensor of coefficient image patches extracted from the input tensor,
 which is expected to have dimensions ordered as follows (depending on the data
@@ -1676,28 +1734,30 @@ sizes:
  *) columns: 5
  *) batch:   7
 
-  Tensor<float, 4> tensor(2,3,5,7);
-  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+    Tensor<float, 4> tensor(2,3,5,7);
+    Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
 
 2x2 image patches can be extracted and indexed using the following code:
 
 *) 2D patch: ColMajor (patch indexed by second-to-last dimension)
-  Tensor<float, 5> twod_patch;
-  twod_patch = tensor.extract_image_patches<2, 2>();
-  // twod_patch.dimension(0) == 2
-  // twod_patch.dimension(1) == 2
-  // twod_patch.dimension(2) == 2
-  // twod_patch.dimension(3) == 3*5
-  // twod_patch.dimension(4) == 7
+
+    Tensor<float, 5> twod_patch;
+    twod_patch = tensor.extract_image_patches<2, 2>();
+    // twod_patch.dimension(0) == 2
+    // twod_patch.dimension(1) == 2
+    // twod_patch.dimension(2) == 2
+    // twod_patch.dimension(3) == 3*5
+    // twod_patch.dimension(4) == 7
 
 *) 2D patch: RowMajor (patch indexed by the second dimension)
-  Tensor<float, 5, RowMajor> twod_patch_row_major;
-  twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
-  // twod_patch_row_major.dimension(0) == 7
-  // twod_patch_row_major.dimension(1) == 3*5
-  // twod_patch_row_major.dimension(2) == 2
-  // twod_patch_row_major.dimension(3) == 2
-  // twod_patch_row_major.dimension(4) == 2
+
+    Tensor<float, 5, RowMajor> twod_patch_row_major;
+    twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+    // twod_patch_row_major.dimension(0) == 7
+    // twod_patch_row_major.dimension(1) == 3*5
+    // twod_patch_row_major.dimension(2) == 2
+    // twod_patch_row_major.dimension(3) == 2
+    // twod_patch_row_major.dimension(4) == 2
 
 ## Special Operations
 
@@ -1737,11 +1797,9 @@ TODO
 
 ## Representation of scalar values
 
-Scalar values are often represented by tensors of size 1 and rank 1. It would be
-more logical and user friendly to use tensors of rank 0 instead. For example
-Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
-product of 2 1d tensors (through contractions) returns a 1d tensor. In the
-future these operations might be updated to return 0d tensors instead.
+Scalar values are often represented by tensors of size 1 and rank 0.For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 0d tensor.
 
 ## Limitations
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 1940a9692..8cac2bb12 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -23,12 +23,12 @@ namespace Eigen {
   * The %Tensor class encompasses only dynamic-size objects so far.
   *
   * The first two template parameters are required:
-  * \tparam Scalar_ \anchor tensor_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  * \tparam Scalar_  Numeric type, e.g. float, double, int or `std::complex<float>`.
   *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
   * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
   *
   * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam Options_ \anchor tensor_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam Options_  A combination of either \b #RowMajor or \b #ColMajor, and of either
   *                 \b #AutoAlign or \b #DontAlign.
   *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
   *                 for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
@@ -42,13 +42,13 @@ namespace Eigen {
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
   *
   * <i><b>Some notes:</b></i>
   *
   * <dl>
   * <dt><b>Relation to other parts of Eigen:</b></dt>
-  * <dd>The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+  * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
   * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
   * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
   * class does not provide any of these features and is only available as a stand-alone class that just allows for
@@ -112,7 +112,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -388,6 +388,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
@@ -398,6 +399,20 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
+    #if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor(Self&& other)
+      : m_storage(std::move(other.m_storage))
+    {
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
+    {
+      m_storage = std::move(other.m_storage);
+      return *this;
+    }
+    #endif
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
     {
@@ -462,6 +477,18 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       // Nothing to do: rank 0 tensors have fixed size
     }
 
+#ifdef EIGEN_HAS_INDEX_LIST
+    template <typename FirstType, typename... OtherTypes>
+    EIGEN_DEVICE_FUNC
+    void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+      array<Index, NumIndices> dims;
+      for (int i = 0; i < NumIndices; ++i) {
+        dims[i] = static_cast<Index>(dimensions[i]);
+      }
+      resize(dims);
+    }
+#endif
+
     /** Custom Dimension */
 #ifdef EIGEN_HAS_SFINAE
     template<typename CustomDimension,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index d06f40cd8..8b8fb9235 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>& type;
+  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename XprType>
@@ -82,28 +82,35 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
 
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -117,7 +124,13 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
@@ -147,7 +160,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -172,7 +185,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
                                                           const ReduceOp& reduce_op,
-                                                          const int return_dim,
+                                                          const Index return_dim,
                                                           const Dims& reduce_dims)
       : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
 
@@ -187,12 +200,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
   const Dims& reduce_dims() const { return m_reduce_dims; }
 
   EIGEN_DEVICE_FUNC
-  int return_dim() const { return m_return_dim; }
+  Index return_dim() const { return m_return_dim; }
 
   protected:
     typename XprType::Nested m_xpr;
     const ReduceOp m_reduce_op;
-    const int m_return_dim;
+    const Index m_return_dim;
     const Dims m_reduce_dims;
 };
 
@@ -209,21 +222,29 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<TupleType, Device> TupleStorageMem;
 
   enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
-    BlockAccess = false,
-    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess      = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim()) {
-
+        m_return_dim(op.return_dim())
+  {
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
@@ -231,19 +252,22 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     } else {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
       m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
-    }
-    m_stride_div = m_strides[m_return_dim];
+    }    
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div = ((m_return_dim >= 0) &&
+                    (m_return_dim < static_cast<Index>(m_strides.size())))
+                   ? m_strides[m_return_dim] : 1;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -252,7 +276,13 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_orig_impl.bind(cgh);
+  }
+#endif
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -288,7 +318,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  protected:
   TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
   TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
-  const int m_return_dim;
+  const Index m_return_dim;
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 166be200c..e5811d63f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -34,6 +34,7 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
   static const int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
 
   enum {
     Flags = 0
@@ -67,6 +68,8 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
   typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
 
@@ -94,20 +97,41 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int NumDims = XprType::NumDims;
 
   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = TensorEvaluator<LeftArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device) :
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(
+        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+        YOU_MADE_A_PROGRAMMING_MISTAKE);
   }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -118,7 +142,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_rightImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -127,7 +151,19 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     // by the rhs to the lhs.
     return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(
+          m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
@@ -136,6 +172,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -163,12 +200,41 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
            TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(
+        m_leftImpl.getResourceRequirements(),
+        m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+        m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      m_leftImpl.writeBlock(desc, block);
+    }
+    block.cleanup();
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+#endif
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
 
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 7a45a5cf4..35b6458e5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -20,9 +20,11 @@ namespace Eigen {
   * \brief The tensor base class.
   *
   * This class is the common parent of the Tensor and TensorMap class, thus
-  * making it possible to use either class interchangably in expressions.
+  * making it possible to use either class interchangeably in expressions.
   */
-
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME Doxygen does not like the inheritance with different template parameters
+// Since there is no doxygen documentation inside, we disable it for now
 template<typename Derived>
 class TensorBase<Derived, ReadOnlyAccessors>
 {
@@ -133,6 +135,78 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_digamma_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
+    bessel_i0() const {
+      return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
+    bessel_i0e() const {
+      return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
+    bessel_i1() const {
+      return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
+    bessel_i1e() const {
+      return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
+    bessel_j0() const {
+      return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
+    bessel_y0() const {
+      return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
+    bessel_j1() const {
+      return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
+    bessel_y1() const {
+      return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
+    bessel_k0() const {
+      return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
+    bessel_k0e() const {
+      return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
+    bessel_k1() const {
+      return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
+    bessel_k1e() const {
+      return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
+    }
+
     // igamma(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
@@ -140,6 +214,20 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
     }
 
+    // igamma_der_a(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+    igamma_der_a(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+    }
+
+    // gamma_sample_der_alpha(alpha = this, sample = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+    gamma_sample_der_alpha(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+    }
+
     // igammac(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
@@ -174,9 +262,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+    ndtri() const {
+      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
     sigmoid() const {
-      return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
+      return unaryExpr(internal::scalar_logistic_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
@@ -186,6 +280,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
+    expm1() const {
+      return unaryExpr(internal::scalar_expm1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
     log() const {
       return unaryExpr(internal::scalar_log_op<Scalar>());
@@ -198,15 +298,29 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
+    log2() const {
+      return unaryExpr(internal::scalar_log2_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+    clip(Scalar min, Scalar max) const {
+      return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
+                                                             TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                                                             Derived>::type
     conjugate() const {
-      return unaryExpr(internal::scalar_conjugate_op<Scalar>());
+      return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
     }
 
     EIGEN_DEVICE_FUNC
@@ -287,22 +401,27 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMax(Scalar threshold) const {
-      return cwiseMax(constant(threshold));
+      return cwiseMax<NanPropagation>(constant(threshold));
     }
 
+    template <int NanPropagation=PropagateFast>
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     cwiseMin(Scalar threshold) const {
-      return cwiseMin(constant(threshold));
+      return cwiseMin<NanPropagation>(constant(threshold));
     }
 
-    template <typename NewType> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
+    template<typename NewType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
+                                                             Derived,
+                                                             TensorConversionOp<NewType, const Derived> >::type
     cast() const {
-      return TensorConversionOp<NewType, const Derived>(derived());
+      return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
     }
 
     EIGEN_DEVICE_FUNC
@@ -312,6 +431,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
+    rint() const {
+      return unaryExpr(internal::scalar_rint_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
     ceil() const {
       return unaryExpr(internal::scalar_ceil_op<Scalar>());
@@ -355,16 +480,16 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMax(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
+    return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
     }
 
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
+  template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
     cwiseMin(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
+      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -479,9 +604,15 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
     contract(const OtherDerived& other, const Dimensions& dims) const {
-      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+    }
+
+    template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+    contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
     }
 
     // Convolutions.
@@ -494,8 +625,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     // Fourier transforms
     template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
-    fft(const FFT& fft) const {
-      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
+    fft(const FFT& dims) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
     }
 
     // Scan.
@@ -557,51 +688,53 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
     maximum(const Dims& dims) const {
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
     }
 
-    const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
     maximum() const {
       DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
     minimum(const Dims& dims) const {
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
     }
 
-    const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
     minimum() const {
       DimensionList<Index, NumDimensions> in_dims;
-      return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     all(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::AndReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     all() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::AndReducer());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     any(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::OrReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
     any() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::OrReducer());
@@ -613,7 +746,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmax() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -626,7 +759,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmin() const {
       array<Index, NumDimensions> in_dims;
-      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -637,7 +770,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmax(const int return_dim) const {
+    argmax(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -650,7 +783,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmin(const int return_dim) const {
+    argmin(const Index return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -665,10 +798,22 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
     }
 
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTraceOp<const Dims, const Derived>
+    trace(const Dims& dims) const {
+      return TensorTraceOp<const Dims, const Derived>(derived(), dims);
+    }
+
+    const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+    trace() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+    }
+
     template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorBroadcastingOp<const Broadcast, const Derived>
-    broadcast(const Broadcast& broadcast) const {
-      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
+    broadcast(const Broadcast& bcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
     }
 
     template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -776,8 +921,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorStridingOp<const Strides, const Derived>
@@ -818,7 +963,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
   protected:
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
@@ -826,6 +972,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
 template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
 class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
+    typedef TensorBase<Derived, ReadOnlyAccessors> Base;
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
     typedef typename DerivedTraits::Index Index;
@@ -834,7 +981,8 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
@@ -972,13 +1120,13 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shuffle) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
-    shuffle(const Shuffle& shuffle) {
-      return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
+    shuffle(const Shuffle& shfl) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
     }
 
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -994,17 +1142,35 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     // Select the device on which to evaluate the expression.
     template <typename DeviceType>
-    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
-      return TensorDevice<Derived, DeviceType>(device, derived());
+    TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
+      return TensorDevice<Derived, DeviceType>(dev, derived());
+    }
+
+    // Select the async device on which to evaluate the expression.
+    template <typename DeviceType, typename DoneCallback>
+    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
+      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
     }
 
  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+      Assign assign(derived(), other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return derived();
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
-
+#endif // EIGEN_PARSED_BY_DOXYGEN
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 000000000..1e55d12c4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,1559 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const DSizes<IndexType, NumDims>& dimensions) {
+  DSizes<IndexType, NumDims> strides;
+  if (NumDims == 0) return strides;
+
+  // TODO(ezhulenev): Use templates to unroll this loop (similar to
+  // h_array_reduce in CXX11meta.h)? Benchmark it.
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * dimensions[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dimensions[i + 1];
+    }
+  }
+
+  return strides;
+}
+
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+    const Eigen::array<IndexType, NumDims>& dimensions) {
+  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+    const Sizes<Indices...>& sizes) {
+  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
+
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines what are the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+//  - kUniformAllDims: 100 blocks of size 10x10
+//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+//                      or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+#ifdef EIGEN_HIPCC
+  // For HIPCC, we need to explicitly declare as a "device fun", the constructor
+  // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
+  // errors out complaining about the lack of a matching constructor
+  EIGEN_DEVICE_FUNC
+  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
+				  TensorOpCost cost_)
+    : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
+  {}
+#endif
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes,
+      TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+      TensorBlockShapeType shape_type, size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage, and write to the tensor
+    // block buffer (scratch or destination memory, reads and writes have linear
+    // access pattern). We ignore the fixed cost of block evaluation, because in
+    // practice it should negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling inner dimension has a much higher cost
+    // because it reads memory randomly, although the total number of moved
+    // bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+                                    size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+      size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+                                    size_in_bytes);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+  merge(const TensorBlockResourceRequirements& lhs,
+        const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+      TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
+  }
+
+  // This is a resource requirement that should be returned from expressions
+  // that do not have any block evaluation preference (e.g. default tensor
+  // expression with raw buffer access).
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
+  }
+
+ private:
+  using Requirements = TensorBlockResourceRequirements;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorBlockShapeType
+  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+            rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+                                                TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and expression on the left, already has
+  // a memory buffer, then we might do performance optimization, and evaluate
+  // the root expression directly into the final output memory. Some time it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to to avoid dynamic memory allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
+      // which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use to build Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However if we are evaluating a root of an expression tree, we
+      // still can materialize an output into this destination, because we can
+      // guarantee that no one will ever access it through block API.
+      //
+      // In theory it is possible to build valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of fast block
+      // evaluation API.
+      kStrided
+    };
+
+    template <typename Scalar>
+    Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
+      return static_cast<Scalar*>(m_data);
+    }
+
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }
+
+   private:
+    friend class TensorBlockDescriptor;
+
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+    template <typename Scalar>
+    DestinationBuffer(Scalar* data, const Dimensions& strides,
+                      DestinationBufferKind kind)
+        : m_data(static_cast<void*>(data)),
+          m_data_type_size(sizeof(Scalar)),
+          m_strides(strides),
+          m_kind(kind) {}
+
+    template <int Layout, typename Scalar>
+    static DestinationBuffer make(const TensorBlockDescriptor& desc,
+                                  Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+    }
+
+    template <int Layout>
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+                                      const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
+      }
+      return kContiguous;
+    }
+
+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
+    void* m_data;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of a tensor
+    // block descriptor it belongs to, however strides might be different.
+    Dimensions m_strides;
+
+    DestinationBufferKind m_kind;
+  };
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
+                        const DestinationBuffer& destination)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(destination) {}
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+      : m_offset(offset),
+        m_dimensions(dimensions),
+        m_destination(DestinationBuffer()) {}
+
+  IndexType offset() const { return m_offset; }
+  const Dimensions& dimensions() const { return m_dimensions; }
+  IndexType dimension(int index) const { return m_dimensions[index]; }
+  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template <int Layout, typename Scalar>
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
+    m_destination =
+        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+  }
+
+  template <int Layout, typename Scalar, typename DstStridesIndexType>
+  void AddDestinationBuffer(
+      Scalar* dst_base,
+      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+    // DSizes constructor will do index type promotion if it's safe.
+    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+  }
+
+  TensorBlockDescriptor& DropDestinationBuffer() {
+    m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
+    return *this;
+  }
+
+  bool HasDestinationBuffer() const {
+    return m_destination.kind() != DestinationBuffer::kEmpty;
+  }
+
+  // Returns a copy of `*this` with updated offset.
+  TensorBlockDescriptor WithOffset(IndexType offset) const {
+    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+  }
+
+ private:
+  // Offset and dimensions are immutable after construction. Block descriptor
+  // can only be mutated by adding or dropping destination.
+  const IndexType m_offset;
+  const Dimensions m_dimensions;
+  DestinationBuffer m_destination;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  TensorBlockMapper() = default;
+  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
+                    const TensorBlockResourceRequirements& requirements)
+      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+    // Compute block dimensions and the total number of blocks.
+    InitializeBlockDimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
+    return m_total_block_count;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
+    return m_block_dimensions.TotalSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
+  blockDimensions() const {
+    return m_block_dimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+  blockDescriptor(IndexType block_index) const {
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    IndexType offset = 0;
+    DSizes<IndexType, NumDims> dimensions;
+
+    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+    // Iterate outer -> inner dimensions.
+    for (int i = NumDims - 1; i >= 0; --i) {
+      const int dim = isColMajor ? i : NumDims - i - 1;
+
+      const IndexType idx = block_index / m_block_strides[dim];
+      block_index -= idx * m_block_strides[dim];
+
+      const IndexType coord = idx * m_block_dimensions[dim];
+      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
+                                     m_block_dimensions[dim]);
+      offset += coord * m_tensor_strides[dim];
+    }
+
+    return {offset, dimensions};
+  }
+
+ private:
+  void InitializeBlockDimensions() {
+    // Requested block shape and size.
+    const TensorBlockShapeType shape_type = m_requirements.shape_type;
+    IndexType target_block_size =
+        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+    IndexType tensor_size = m_tensor_dimensions.TotalSize();
+
+    // Corner case: one of the dimensions is zero. Logic below is too complex
+    // to handle this case on a general basis, just use unit block size.
+    // Note: we must not yield blocks with zero dimensions (recipe for
+    // overflows/underflows, divisions by zero and NaNs later).
+    if (tensor_size == 0) {
+      for (int i = 0; i < NumDims; ++i) {
+        m_block_dimensions[i] = 1;
+      }
+      m_total_block_count = 0;
+      return;
+    }
+
+    // If tensor fits into a target block size, evaluate it as a single block.
+    if (tensor_size <= target_block_size) {
+      m_block_dimensions = m_tensor_dimensions;
+      m_total_block_count = 1;
+      // The only valid block index is `0`, and in this case we do not need
+      // to compute real strides for tensor or blocks (see blockDescriptor).
+      for (int i = 0; i < NumDims; ++i) {
+        m_tensor_strides[i] = 0;
+        m_block_strides[i] = 1;
+      }
+      return;
+    }
+
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    // Block shape skewed towards inner dimension.
+    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+      IndexType coeff_to_allocate = target_block_size;
+
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+        m_block_dimensions[dim] =
+            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate = divup(
+            coeff_to_allocate,
+            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate tensor
+      // block dimension sizes based on "square" dimension size target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size),
+                   1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] =
+            numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims =
+              total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail =
+              divup<IndexType>(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] =
+              numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini<IndexType>(target_block_size,
+                                         m_tensor_dimensions.TotalSize()));
+
+    // Calculate block counts by dimension and total block count.
+    DSizes<IndexType, NumDims> block_count;
+    for (int i = 0; i < NumDims; ++i) {
+      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+    }
+    m_total_block_count = array_prod(block_count);
+
+    // Calculate block strides (used for enumerating blocks).
+    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+    m_block_strides = strides<Layout>(block_count);
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// are happening in the same order, and usually have exactly the same size.
+// Scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device)
+      : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation att current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have existing allocation, and its size is larger or equal to
+    // requested size, we do nothing.
+
+    // If current allocation can't fit requested size, we deallocate it, and
+    // replace with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have and existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds, that can be produced by
+// TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
+// TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts Scalar type from the Eigen expressions (if expression type
+// is not void). It's required to be able to define lazy block expression for
+// argument types, that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions, that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer, it's either a memory
+// buffer that backs the original expression (e.g. block is just a view into a
+// Tensor), or a memory buffer allocated with scratch allocator, and in this
+// case the scratch allocator will deallocate it at the end of block based
+// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout,
+          typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
+                          const Dimensions& dimensions, bool valid_expr = true)
+      : m_kind(kind),
+        m_data(data),
+        m_dimensions(dimensions),
+        m_expr(m_data, m_dimensions),
+        m_valid_expr(valid_expr) {
+    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+  }
+
+  TensorBlockKind kind() const { return m_kind; }
+  // NOTE(ezhulenev): Returning XprType by value like in other block types
+  // causes asan failures. The theory is that XprType::Nested doesn't work
+  // properly for TensorMap.
+  const XprType& expr() const {
+    eigen_assert(m_valid_expr);
+    return m_expr;
+  }
+  const Scalar* data() const { return m_data; }
+  void cleanup() {}
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(
+          m_materialized_in_output
+              ? internal::TensorBlockKind::kMaterializedInOutput
+              : internal::TensorBlockKind::kMaterializedInScratch,
+          m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock;
+
+    Storage(Scalar* data, const Dimensions& dimensions,
+            const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates a storage for materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with scratch allocator.
+  template <typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static Storage prepareStorage(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch,
+      bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+               allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+                     internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory buffer.
+  template <typename DataDimensions, typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+      const Scalar* data, const DataDimensions& data_dims,
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If a tensor block dimensions covers a contiguous block of the underlying
+    // memory, we can skip block buffer memory allocation, and construct a block
+    // from existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:          [11, 12, 13, 14]
+    //   desc.dimensions():  [1,   1,  3, 14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+                                     block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+          TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+                           data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+                           storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+      type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static const bool NoArgBlockAccess =
+      internal::is_void<typename LhsTensorBlock::XprType>::value ||
+      internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+                          const typename RhsTensorBlock::XprType> >::type
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+                         const RhsTensorBlock& right_block,
+                         const BinaryOp& functor)
+      : m_left_block(left_block),
+        m_right_block(right_block),
+        m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const {
+    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+  }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+  typedef typename ArgTensorBlock::XprType ArgXprType;
+  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+                       const BlockFactory& factory)
+      : m_arg_block(arg_block), m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+          typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+  typedef typename Arg1TensorBlock::XprType Arg1XprType;
+  typedef typename Arg2TensorBlock::XprType Arg2XprType;
+  typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+                                       internal::is_void<Arg2XprType>::value ||
+                                       internal::is_void<Arg3XprType>::value;
+
+ public:
+  typedef typename conditional<
+      NoArgBlockAccess, void,
+      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+                                              Arg3XprType>::type>::type XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+                         const Arg2TensorBlock& arg2_block,
+                         const Arg3TensorBlock& arg3_block,
+                         const BlockFactory& factory)
+      : m_arg1_block(arg1_block),
+        m_arg2_block(arg2_block),
+        m_arg3_block(arg3_block),
+        m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const {
+    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+                          m_arg3_block.expr());
+  }
+  const Scalar* data() const { return NULL; }
+  void cleanup() {
+    m_arg1_block.cleanup();
+    m_arg2_block.cleanup();
+    m_arg3_block.cleanup();
+  }
+
+ private:
+  Arg1TensorBlock m_arg1_block;
+  Arg2TensorBlock m_arg2_block;
+  Arg3TensorBlock m_arg3_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+ public:
+  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
+  enum class Kind {
+    Linear = 0,       // src_stride == 1 && dst_stride == 1
+    Scatter = 1,      // src_stride == 1 && dst_stride != 1
+    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
+    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
+    Gather = 4,       // dst_stride == 1
+    Random = 5        // everything else
+  };
+
+  struct Dst {
+    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    Scalar* data;
+  };
+
+  struct Src {
+    Src(IndexType o, IndexType s, const Scalar* d)
+        : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    const Scalar* data;
+  };
+
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
+                                                        const Src& src,
+                                                        const size_t count) {
+    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
+              src.data);
+  }
+
+ private:
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const IndexType count, const IndexType dst_offset,
+      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+      const IndexType src_offset, const IndexType src_stride,
+      const Scalar* EIGEN_RESTRICT src_data) {
+    const Scalar* src = &src_data[src_offset];
+    Scalar* dst = &dst_data[dst_offset];
+
+    if (!Vectorizable) {
+      for (Index i = 0; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+      return;
+    }
+
+    const IndexType vectorized_size = count - PacketSize;
+    IndexType i = 0;
+
+    if (kind == StridedLinearBufferCopy::Kind::Linear) {
+      // ******************************************************************** //
+      // Linear copy from `src` to `dst`.
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      eigen_assert(src_stride == 1 && dst_stride == 1);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          Packet p = ploadu<Packet>(src + i + j * PacketSize);
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
+      // Scatter from `src` to `dst`.
+      eigen_assert(src_stride == 1 && dst_stride != 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
+      // Fill `dst` with value at `*src`.
+      eigen_assert(src_stride == 0 && dst_stride == 1);
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      Packet p = pload1<Packet>(src);
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i <= vectorized_size; i += PacketSize) {
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
+      // Scatter `*src` into `dst`.
+      eigen_assert(src_stride == 0 && dst_stride != 1);
+      Packet p = pload1<Packet>(src);
+      for (; i <= vectorized_size; i += PacketSize) {
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = *src;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
+      // Gather from `src` into `dst`.
+      eigen_assert(dst_stride == 1);
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i * src_stride];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
+      // Random.
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+    } else {
+      eigen_assert(false);
+    }
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
+// It's possible to specify src->dst dimension mapping for the copy operation.
+// Dimensions of `dst` specify how many elements have to be copied, for the
+// `src` we need to know only stride to navigate through source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+  static const bool IsColMajor = (Layout == ColMajor);
+
+  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef DSizes<int, NumDims> DimensionsMap;
+
+  struct Dst {
+    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
+        IndexType dst_offset = 0)
+        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  struct Src {
+    Src(const Dimensions& src_strides, const Scalar* src,
+        IndexType src_offset = 0)
+        : strides(src_strides), data(src), offset(src_offset) {}
+
+    Dimensions strides;
+    const Scalar* data;
+    IndexType offset;
+  };
+
+  // Copies data to `dst` from `src`, using provided dimensions mapping:
+  //
+  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+  //
+  // Returns the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
+      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
+    // Copy single scalar value from `src` to `dst`.
+    if (NumDims == 0) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Both `dst` and `src` must have contiguous innermost dimension. We also
+    // accept the special case with stride '0', because it's used as a trick to
+    // implement broadcasting.
+    {
+      int inner_dim = IsColMajor ? 0 : NumDims - 1;
+      EIGEN_UNUSED_VARIABLE(inner_dim);
+      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+    }
+
+    // Give a shorter name to `dst_to_src_dim_map`.
+    const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+    // Do not squeeze reordered inner dimensions.
+    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+    // block, and we write data linearly into that dimension, reading it from
+    // the src. If dimensions are reordered, we might end up reading data from
+    // the src with `stride != 1`.
+    //
+    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+    // Find the innermost dimension in the dst whose size is not 1. This is the
+    // effective inner dim.
+    int num_size_one_inner_dims = 0;
+    for (int i = 0; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      if (dst.dims[dst_dim] != 1) break;
+      num_size_one_inner_dims++;
+    }
+
+    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+    if (num_size_one_inner_dims == NumDims) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+    const int dst_stride1_dim = IsColMajor
+                                    ? num_size_one_inner_dims
+                                    : NumDims - num_size_one_inner_dims - 1;
+
+    // Dimension in the src that corresponds to the dst innermost dimension.
+    const int src_dim_for_dst_stride1_dim =
+        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+    // Size of the innermost dimension (length of contiguous blocks of memory).
+    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+    // `src` memory, so we can do less linear copy calls.
+    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      const IndexType dst_stride = dst.strides[dst_dim];
+      const IndexType src_stride = src.strides[dim_map[dst_dim]];
+      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+        dst_inner_dim_size *= dst.dims[dst_dim];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    // Setup strides to read data from `src` and write to `dst`.
+    IndexType input_offset = src.offset;
+    IndexType output_offset = dst.offset;
+    IndexType input_stride =
+        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> it;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int idx = 0;  // currently initialized iterator state index
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      if (dst.dims[dst_dim] == 1) continue;
+
+      it[idx].size = dst.dims[dst_dim];
+      it[idx].input_stride = src.strides[dim_map[dst_dim]];
+      it[idx].output_stride = dst.strides[dst_dim];
+
+      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+      idx++;
+    }
+
+    // Iterate copying data from src to dst.
+    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND)                                           \
+  IndexType num_copied = 0;                                            \
+  for (num_copied = 0; num_copied < block_total_size;                  \
+       num_copied += dst_inner_dim_size) {                             \
+    LinCopy::template Run<KIND>(                                       \
+        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
+        typename LinCopy::Src(input_offset, input_stride, src.data),   \
+        dst_inner_dim_size);                                           \
+                                                                       \
+    for (int j = 0; j < idx; ++j) {                                    \
+      if (++it[j].count < it[j].size) {                                \
+        input_offset += it[j].input_stride;                            \
+        output_offset += it[j].output_stride;                          \
+        break;                                                         \
+      }                                                                \
+      it[j].count = 0;                                                 \
+      input_offset -= it[j].input_span;                                \
+      output_offset -= it[j].output_span;                              \
+    }                                                                  \
+  }                                                                    \
+  return num_copied;
+
+    if (input_stride == 1 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Linear);
+    } else if (input_stride == 1 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Scatter);
+    } else if (input_stride == 0 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
+    } else if (input_stride == 0 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
+    } else if (output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Gather);
+    } else {
+      COPY_INNER_DIM(LinCopy::Kind::Random);
+    }
+
+#undef COPY_INNER_DIM
+  }
+
+  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+  // the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
+                                                              const Src& src) {
+    DimensionsMap dst_to_src_map;
+    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+    return Copy(dst, src, dst_to_src_map);
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          input_stride(0),
+          output_stride(0),
+          input_span(0),
+          output_span(0) {}
+
+    IndexType size;
+    IndexType count;
+    IndexType input_stride;
+    IndexType output_stride;
+    IndexType input_span;
+    IndexType output_span;
+  };
+
+  // Compute how many inner dimensions it's allowed to squeeze when doing IO
+  // between two tensor blocks. It's safe to squeeze inner dimensions, only
+  // if they are not reordered.
+  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+    int num_squeezable_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      if (dim_map[dim] != dim) break;
+      num_squeezable_dims++;
+    }
+    return num_squeezable_dims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory, if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see definition above).
+//
+// Also currently the innermost dimension of `target` must have a stride '1'
+// (contiguous in memory). This restriction could be lifted with a `pscatter`,
+// but in practice it's never needed, and there is a similar TensorBlockIO
+// workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr,
+          typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+  // We will use coeff/packet path to evaluate block expressions.
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+      TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size
+  };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+                                        const Evaluator& eval,
+                                        IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType vectorized_size = count - PacketSize;
+      IndexType i = 0;
+
+      for (; i <= unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu<Scalar>(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i <= vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu<Scalar>(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides,
+           Scalar* target_data, IndexType target_offset = 0)
+        : dims(target_dims),
+          strides(target_strides),
+          data(target_data),
+          offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims,
+                       const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(
+      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+      Scalar* target_data, IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides),
+                  target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Target& target, const TensorBlockExpr& expr) {
+    // Prepare evaluator for block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimension should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize output inner dimension size based on a layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+
+    int idx = 0;  // currently initialized iterator state index
+    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+      it[idx].count = 0;
+      it[idx].size = target.dims[dim];
+      it[idx].output_stride = target.strides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // We read block expression from the beginning, and start writing data to
+    // `target` at given offset.
+    IndexType input_offset = 0;
+    IndexType output_offset = target.offset;
+
+    // Iterate copying data from `eval` to `target`.
+    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+      // Assign to `target` at current offset.
+      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
+                     TensorBlockEvaluator>::Run(target.data + output_offset,
+                                                output_inner_dim_size, eval,
+                                                input_offset);
+
+      // Move input offset forward by the number of assigned coefficients.
+      input_offset += output_inner_dim_size;
+
+      // Update index.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0), size(0), output_stride(0), output_span(0) {}
+
+    IndexType count;
+    IndexType size;
+    IndexType output_stride;
+    IndexType output_span;
+  };
+};
+
+// -------------------------------------------------------------------------- //
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 4cfe300eb..a354132f6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -31,12 +31,13 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Broadcast, typename XprType>
 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
 {
-  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
+  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<typename Broadcast, typename XprType>
@@ -54,7 +55,7 @@ struct is_input_scalar<Sizes<> > {
   static const bool value = true;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::size_t... Indices>
+template <typename std::ptrdiff_t... Indices>
 struct is_input_scalar<Sizes<Indices...> > {
   static const bool value = (Sizes<Indices...>::total_size == 1);
 };
@@ -103,27 +104,57 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
+  bool isCopy, nByOne, oneByN;
+  public:
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : m_broadcast(op.broadcast()),m_impl(op.expression(), device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
+  // strides. See block method implementation for details.
+  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : isCopy(false), nByOne(false), oneByN(false),
+        m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
   {
+
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
-    const Broadcast& broadcast = op.broadcast();
+    isCopy = true;
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i] * broadcast[i];
+      m_dimensions[i] = input_dims[i] * m_broadcast[i];
+      if (m_broadcast[i] != 1) {
+        isCopy = false;
+      }
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -141,16 +172,58 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
       }
     }
+
+    if (input_dims[0] == 1) {
+      oneByN = true;
+      for (int i = 1; i < NumDims; ++i) {
+        if (m_broadcast[i] != 1) {
+          oneByN = false;
+          break;
+        }
+      }
+    } else if (input_dims[NumDims-1] == 1) {
+      nByOne = true;
+      for (int i = 0; i < NumDims-1; ++i) {
+        if (m_broadcast[i] != 1) {
+          nByOne = false;
+          break;
+        }
+      }
+    }
+
+    // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
+    // broadcast shape is '[N, 1..., N]'
+    if (!oneByN && !nByOne) {
+      if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
+        nByOne = true;
+        oneByN = true;
+        for (int i = 1; i < NumDims-1; ++i) {
+          if (m_broadcast[i] != 1) {
+            nByOne = false;
+            oneByN = false;
+            break;
+          }
+        }
+      }
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -161,16 +234,24 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return coeffColMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffColMajor(index);
+      }
     } else {
-      return coeffRowMajor(index);
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffRowMajor(index);
+      }
     }
   }
 
   // TODO: attempt to speed this up. The integer divisions and modulo are slow
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -195,12 +276,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         inputIndex += (index % m_impl.dimensions()[0]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
   {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -215,17 +301,22 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
       index -= idx * m_outputStrides[i];
     }
-    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
-      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
       inputIndex += index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
-        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
       } else {
-        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
       }
     }
-    return m_impl.coeff(inputIndex);
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+  {
+    return m_impl.coeff(indexRowMajor(index));
   }
 
   template<int LoadMode>
@@ -236,9 +327,148 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return packetColMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetColMajor<LoadMode>(index);
+      }
     } else {
-      return packetRowMajor<LoadMode>(index);
+      if (isCopy) {
+        #ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+        #else
+        return m_impl.template packet<LoadMode>(index);
+        #endif
+      } else if (oneByN && !nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetRowMajor<LoadMode>(index);
+      }
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
+  (Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index startDim, endDim;
+    Index inputIndex, outputOffset, batchedIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      startDim = NumDims - 1;
+      endDim = 1;
+    } else {
+      startDim = 0;
+      endDim = NumDims - 2;
+    }
+
+    batchedIndex = index % m_outputStrides[startDim];
+    inputIndex   = batchedIndex / m_outputStrides[endDim];
+    outputOffset = batchedIndex % m_outputStrides[endDim];
+
+    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[endDim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          ++inputIndex;
+          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+          values[i] = m_impl.coeff(inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    Index dim, inputIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = NumDims - 1;
+    } else {
+      dim = 0;
+    }
+
+    inputIndex = index % m_inputStrides[dim];
+    if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (inputIndex > m_inputStrides[dim]-1) {
+          inputIndex = 0;
+        }
+        values[i] = m_impl.coeff(inputIndex++);
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index dim, inputIndex, outputOffset;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = 1;
+    } else {
+      dim = NumDims - 2;
+    }
+
+    inputIndex   = index / m_outputStrides[dim];
+    outputOffset = index % m_outputStrides[dim];
+    if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[dim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
     }
   }
 
@@ -253,6 +483,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -288,8 +519,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffColMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[0]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffColMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -305,6 +541,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -340,8 +577,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffRowMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffRowMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -351,7 +593,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     double compute_cost = TensorOpCost::AddCost<Index>();
-    if (NumDims > 0) {
+    if (!isCopy && NumDims > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
         if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -372,14 +615,472 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    const size_t target_size = m_device.firstLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        m_impl.getResourceRequirements(),
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    BlockBroadcastingParams params = blockBroadcastingParams(desc);
+
+    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
+      return emptyBlock();
+    }
+
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();
+
+    // We potentially will need to materialize input blocks.
+    size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;
+
+    // Initialize block broadcating iterator state for outer dimensions (outer
+    // with regard to bcast dimension). Dimension in this array are always in
+    // inner_most -> outer_most order (col major layout).
+    array<BlockBroadcastingIteratorState, NumDims> it;
+    int idx = 0;
+
+    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
+      const Index dim = IsColMajor ? i : NumDims - 1 - i;
+      it[idx].size = params.output_dims[dim];
+      it[idx].count = 0;
+      it[idx].output_stride = m_outputStrides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // Write output into the beginning of `materialized_output`.
+    Index output_offset = 0;
+
+    // We will fill output block by broadcasting along the bcast dim, and
+    // iterating over outer dimension.
+    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
+
+    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
+      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
+      Index bcast_offset = desc.offset() + output_offset;
+
+      // Broadcast along the bcast dimension.
+      num_output_coeffs += BroadcastBlockAlongBcastDim(
+          params, bcast_offset, scratch, bcast_output, &materialized_input,
+          &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Broadcast functor() const { return m_broadcast; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+      cl::sycl::handler& cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+ private:
+  static const bool IsColMajor =
+      static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //           dim:  0  1  2   (ColMajor)
+  //    input size: [9, 3, 6]
+  //    block size: [9, 2, 6]
+  //
+  // We will compute broadcasted block by iterating over the outer dimensions
+  // before `bcast_dim` (only dimension `2` in this example) and computing
+  // broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+  blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size smaller
+    // that the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the block
+    // and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
+      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
+      params.bcast_dim = dim;
+      params.bcast_dim_size = params.output_dims[dim];
+      break;
+    }
+
+    // Calculate the input block size for looking into the input.
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = params.input_dims[dim];
+    }
+    for (int i = params.inner_dim_count; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = 1;
+    }
+    params.input_block_strides =
+        internal::strides<Layout>(params.input_block_sizes);
+
+    // Broadcast with the 0-stride trick: Create 1 extra dim for each
+    // broadcast, set the input stride to 0.
+    //
+    // When ColMajor:
+    //
+    // - bcast_block_sizes:
+    //   [d_0, b_0, d_1, b_1, ...]
+    //
+    // - bcast_block_strides:
+    //   [output_block_strides[0], output_block_strides[0] * d_0,
+    //    output_block_strides[1], output_block_strides[1] * d_1,
+    //   ...]
+    //
+    // - bcast_input_strides:
+    //   [input_block_strides[0], 0,
+    //    input_block_strides[1], 0,
+    //   ...].
+    //
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
+
+      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
+      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
+      params.bcast_block_strides[broadcast_dim] =
+          params.output_strides[dim] * params.input_dims[dim];
+      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
+      params.bcast_input_strides[broadcast_dim] = 0;
+    }
+
+    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
+      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
+      params.bcast_block_sizes[dim] = 1;
+      params.bcast_block_strides[dim] = 0;
+      params.bcast_input_strides[dim] = 0;
+    }
+
+    return params;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
+    DSizes<Index, NumDims> dimensions;
+    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
+      BlockBroadcastingParams params, Index bcast_offset,
+      TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
+      ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    if (params.bcast_dim_size == 1) {
+      // We just need one block read using the ready-set values above.
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else if (params.input_dims[params.bcast_dim] == 1) {
+      // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
+      const int broadcast_bcast_dim =
+          IsColMajor ? 2 * params.inner_dim_count + 1
+                     : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
+      params.bcast_input_strides[broadcast_bcast_dim] = 0;
+      params.bcast_block_strides[broadcast_bcast_dim] =
+          params.output_strides[params.bcast_dim];
+
+      return BroadcastBlock(
+          params.input_block_sizes, params.input_block_strides,
+          params.bcast_block_sizes, params.bcast_block_strides,
+          params.bcast_input_strides, bcast_offset, 0, scratch,
+          materialized_output, materialized_input, materialized_input_size);
+
+    } else {
+      // Keep track of the total number of the coefficients written to the
+      // output block.
+      Index num_output_coeffs = 0;
+
+      // The general case. Let's denote the output block as
+      //
+      //   x[..., a:a+bcast_dim_size, :, ..., :]
+      //
+      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
+      // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
+      // sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
+      //     in [a, a+bcast_dim_size].
+      //
+      // (3) c:a+bcast_dim_size .
+      //
+      // Or, when b and c do not exist, we just need to process the whole block
+      // together.
+
+      // Find a.
+      const Index bcast_dim_left_index =
+          bcast_offset / m_outputStrides[params.bcast_dim];
+
+      // Find b and c.
+      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
+
+      // First multiple after a. This is b when <= bcast_dim_left_index +
+      // bcast_dim_size.
+      const Index first_multiple =
+          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+          input_bcast_dim_size;
+
+      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
+        // b exists, so does c. Find it.
+        const Index last_multiple =
+            (bcast_dim_left_index + params.bcast_dim_size) /
+            input_bcast_dim_size * input_bcast_dim_size;
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        const int broadcast_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count + 1
+                       : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+        if (first_multiple > bcast_dim_left_index) {
+          const Index head_size = first_multiple - bcast_dim_left_index;
+          params.input_block_sizes[params.bcast_dim] = head_size;
+          params.bcast_block_sizes[copy_bcast_dim] = head_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, 0, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (first_multiple < last_multiple) {
+          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
+          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] =
+              (last_multiple - first_multiple) / input_bcast_dim_size;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (first_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
+          const Index tail_size =
+              bcast_dim_left_index + params.bcast_dim_size - last_multiple;
+          params.input_block_sizes[params.bcast_dim] = tail_size;
+          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
+          params.bcast_input_strides[copy_bcast_dim] =
+              params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] =
+              params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] *
+              params.input_dims[params.bcast_dim];
+          const Index offset = (last_multiple - bcast_dim_left_index) *
+                               m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs += BroadcastBlock(
+              params.input_block_sizes, params.input_block_strides,
+              params.bcast_block_sizes, params.bcast_block_strides,
+              params.bcast_input_strides, bcast_offset, offset, scratch,
+              materialized_output, materialized_input, materialized_input_size);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count
+                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
+        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
+        params.bcast_input_strides[copy_bcast_dim] =
+            params.input_block_strides[params.bcast_dim];
+        params.bcast_block_strides[copy_bcast_dim] =
+            params.output_strides[params.bcast_dim];
+
+        num_output_coeffs += BroadcastBlock(
+            params.input_block_sizes, params.input_block_strides,
+            params.bcast_block_sizes, params.bcast_block_strides,
+            params.bcast_input_strides, bcast_offset, 0, scratch,
+            materialized_output, materialized_input, materialized_input_size);
+      }
+
+      return num_output_coeffs;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
+      const Dimensions& input_block_sizes,
+      const Dimensions& input_block_strides,
+      const BroadcastDimensions& bcast_block_sizes,
+      const BroadcastDimensions& bcast_block_strides,
+      const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
+      Index offset, TensorBlockScratch& scratch,
+      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
+      size_t* materialized_input_size) const {
+    // ---------------------------------------------------------------------- //
+    // Tensor block descriptor for reading block from the input.
+    const Index input_offset = bcast_offset + offset;
+    TensorBlockDesc input_desc(
+        IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
+        input_block_sizes);
+
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
+
+    // ---------------------------------------------------------------------- //
+    // Materialize input block into a temporary memory buffer only if it's not
+    // already available in the arg block.
+    const ScalarNoConst* input_buffer = NULL;
+
+    if (input_block.data() != NULL) {
+      // Input block already has raw data, there is no need to materialize it.
+      input_buffer = input_block.data();
+
+    } else {
+      // Otherwise we have to do block assignment into a temporary buffer.
+
+      // Maybe reuse previously allocated buffer, or allocate a new one with a
+      // scratch allocator.
+      const size_t input_total_size = input_block_sizes.TotalSize();
+      if (*materialized_input == NULL ||
+          *materialized_input_size < input_total_size) {
+        *materialized_input_size = input_total_size;
+        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+        *materialized_input = static_cast<ScalarNoConst*>(mem);
+      }
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+                                        *materialized_input),
+          input_block.expr());
+
+      input_buffer = *materialized_input;
+    }
+
+    // ---------------------------------------------------------------------- //
+    // Copy data from materialized input block to the materialized output, using
+    // given broadcast strides (strides with zeroes).
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
+        TensorBlockIO;
+
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
+                                      materialized_output + offset);
+
+    return TensorBlockIO::Copy(dst, src);
+  }
 
- protected:
-  const Broadcast m_broadcast;
+protected:
+  const Device EIGEN_DEVICE_REF m_device;
+  const typename internal::remove_reference<Broadcast>::type m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 1ba7ef170..376457341 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -32,12 +32,13 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions - 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex DimId, typename XprType>
 struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
 {
-  typedef const TensorChippingOp<DimId, XprType>& type;
+  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
 };
 
 template<DenseIndex DimId, typename XprType>
@@ -50,6 +51,7 @@ template <DenseIndex DimId>
 struct DimensionId
 {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -78,44 +80,28 @@ template<DenseIndex DimId, typename XprType>
 class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+    typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+        : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
-      : m_xpr(expr), m_offset(offset), m_dim(dim) {
-  }
+    EIGEN_DEVICE_FUNC
+    const Index offset() const { return m_offset; }
+    EIGEN_DEVICE_FUNC
+    const Index dim() const { return m_dim.actualDim(); }
 
-  EIGEN_DEVICE_FUNC
-  const Index offset() const { return m_offset; }
-  EIGEN_DEVICE_FUNC
-  const Index dim() const { return m_dim.actualDim(); }
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type&
+    expression() const { return m_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  const typename internal::remove_all<typename XprType::Nested>::type&
-  expression() const { return m_xpr; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
-
-  template<typename OtherDerived>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
-  {
-    typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
-    Assign assign(*this, other);
-    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-    return *this;
-  }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -136,20 +122,49 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets.
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    // Chipping of outer-most dimension is a trivial operation, because we can
+    // read and write directly from the underlying tensor using single offset.
+    IsOuterChipping   = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == 0),
+    // Chipping inner-most dimension.
+    IsInnerChipping   = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
+                        (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
+    // Prefer block access if the underlying expression prefers it, otherwise
+    // only if chipping is not trivial.
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
+                        !IsOuterChipping,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+      ArgTensorBlockDesc;
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -185,12 +200,12 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -205,21 +220,20 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
-	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       Index inputIndex = index * m_inputStride + m_inputOffset;
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = m_impl.coeff(inputIndex);
         inputIndex += m_inputStride;
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
-	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(m_stride > index);
       return m_impl.template packet<LoadMode>(index + m_inputOffset);
     } else {
@@ -231,6 +245,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       } else {
         // Cross the stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+       EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           values[i] = coeff(index);
           ++index;
@@ -263,29 +278,100 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
            TensorOpCost(0, 0, cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
-    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
-    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
-         (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
-        result) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    const Index chip_dim = m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i]
+            = i < chip_dim ? desc.dimension(i)
+            : i > chip_dim ? desc.dimension(i - 1)
+            : 1;
+    }
+
+    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+    // Try to reuse destination buffer for materializing argument block.
+    if (desc.HasDestinationBuffer()) {
+      DSizes<Index, NumInputDims> arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+      arg_destination_strides[i]
+            = i < chip_dim ? desc.destination().strides()[i]
+            : i > chip_dim ? desc.destination().strides()[i - 1]
+            : 0; // for dimensions of size `1` stride should never be used.
+      }
+
+      arg_desc.template AddDestinationBuffer<Layout>(
+          desc.destination().template data<ScalarNoConst>(),
+          arg_destination_strides);
+    }
+
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+
+    if (arg_block.data() != NULL) {
+      // Forward argument block buffer if possible.
+      return TensorBlock(arg_block.kind(), arg_block.data(),
+                           desc.dimensions());
+
+    } else {
+      // Assign argument block expression to a buffer.
+
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlock::Storage block_storage =
+          TensorBlock::prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              arg_desc.dimensions(),
+              internal::strides<Layout>(arg_desc.dimensions()),
+              block_storage.data()),
+          arg_block.expr());
+
+      return block_storage.AsTensorMaterializedBlock();
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
+    if (isOuterChipping() && result) {
       return result + m_inputOffset;
     } else {
       return NULL;
     }
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex;
-    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
-	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+    if (isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       inputIndex = index * m_inputStride + m_inputOffset;
-    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
-	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer
+      // division.
       eigen_assert(m_stride > index);
       inputIndex = index + m_inputOffset;
     } else {
@@ -297,13 +383,25 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     return inputIndex;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
+    return IsInnerChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
+    return IsOuterChipping ||
+           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
+           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
+  }
+
   Dimensions m_dimensions;
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
@@ -321,15 +419,21 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    RawAccess = false
+    IsAligned     = false,
+    PacketAccess  = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess   = TensorEvaluator<ArgType, Device>::RawAccess,
+    Layout        = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess     = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
@@ -343,20 +447,19 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
-	(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+    if (this->isInnerChipping()) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(this->m_stride == 1);
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->m_impl.coeffRef(inputIndex) = values[i];
         inputIndex += this->m_inputStride;
       }
-    } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
-	       (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
-      // m_stride is aways greater than index, so let's avoid the integer division.
+    } else if (this->isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(this->m_stride > index);
       this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
     } else {
@@ -369,6 +472,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
         // Cross stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
         internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+        EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           this->coeffRef(index) = values[i];
           ++index;
@@ -376,6 +480,36 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       }
     }
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
+
+    const Index chip_dim = this->m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+                          : i > chip_dim ? desc.dimension(i - 1)
+                          : 1;
+    }
+
+    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+                              const typename TensorBlock::XprType>
+        TensorBlockExpr;
+
+    typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+                                            TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(
+            input_block_dims,
+            internal::strides<Layout>(this->m_impl.dimensions()),
+            this->m_impl.data(), this->srcCoeff(desc.offset())),
+        block.expr().reshape(input_block_dims));
+  }
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 59bf90d93..5235a8e6f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -37,6 +37,8 @@ struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
   enum { Flags = 0 };
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename Axis, typename LhsXprType, typename RhsXprType>
@@ -58,6 +60,7 @@ template<typename Axis, typename LhsXprType, typename RhsXprType>
 class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
 {
   public:
+    typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
     typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
     typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
     typedef typename internal::traits<TensorConcatenationOp>::Index Index;
@@ -79,25 +82,7 @@ class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsX
 
     EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const TensorConcatenationOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorConcatenationOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
@@ -117,14 +102,24 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -177,14 +172,14 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
   {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -215,11 +210,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index left_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         left_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
       } else {
         left_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
@@ -231,11 +228,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index right_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         right_index = subs[0];
+        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
       } else {
         right_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
@@ -248,11 +247,12 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -275,7 +275,15 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
            TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
 
   protected:
     Dimensions m_dimensions;
@@ -296,13 +304,21 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
   typedef typename Base::Dimensions Dimensions;
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+                        TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
     : Base(op, device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -344,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 20b29e5fd..8b35f7985 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -21,8 +21,8 @@ namespace Eigen {
   */
 namespace internal {
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
@@ -38,53 +38,305 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
 
   // From NumDims below.
-  static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                               typename traits<LhsXprType>::PointerType,
+                               typename traits<RhsXprType>::PointerType>::type
+      PointerType;
 
   enum {
     Flags = 0
   };
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
 {
-  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
 {
-  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
 };
 
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
   typedef Indices_ Indices;
   typedef LeftArgType_ LeftArgType;
   typedef RightArgType_ RightArgType;
+  typedef OutputKernelType_ OutputKernelType;
   typedef Device_ Device;
 
   // From NumDims below.
   static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
+// Helper class to allocate and deallocate temporary memory for packed buffers.
+template <typename LhsScalar, typename RhsScalar>
+struct TensorContractionBlockMemAllocator {
+  typedef void* BlockMemHandle;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
+                                                   const Index bk,
+                                                   const Index bn,
+                                                   LhsScalar** lhs_block,
+                                                   RhsScalar** rhs_block) {
+    eigen_assert(lhs_block);
+    eigen_assert(rhs_block);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
+    eigen_assert(block_mem);
+    *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
+    *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
+      Device& d, const Index bm, const Index bk, const Index bn,
+      const Index num_lhs, const Index num_rhs, const Index num_slices,
+      std::vector<LhsScalar*>* lhs_blocks,
+      std::vector<RhsScalar*>* rhs_blocks) {
+    eigen_assert(num_slices > 0);
+    eigen_assert(num_lhs >= 0 && num_rhs >= 0);
+    eigen_assert(num_lhs == 0 || lhs_blocks);
+    eigen_assert(num_rhs == 0 || rhs_blocks);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    void* block_mem = d.allocate(
+        (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
+    eigen_assert(block_mem);
+    char* mem = static_cast<char*>(block_mem);
+
+    for (Index x = 0; x < num_slices; x++) {
+      if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
+      for (Index m = 0; m < num_lhs; m++) {
+        lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
+        mem += sz.lhs_size;
+      }
+      if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
+      for (Index n = 0; n < num_rhs; n++) {
+        rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
+        mem += sz.rhs_size;
+      }
+    }
+
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    d.deallocate(handle);
+  }
+
+ private:
+  struct BlockSizes {
+    Index lhs_size;
+    Index rhs_size;
+  };
+  EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
+                                                              const Index bk,
+                                                              const Index bn) {
+    Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+    BlockSizes sz;
+    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of ResScalar
+//   type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+    typename StorageIndex, typename OutputMapper, typename LhsMapper,
+    typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC
+  TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
+                          StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
+  typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
+      BlockMemAllocator;
+  typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
+
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<
+      LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
+      Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+                                  typename RhsMapper::SubMapper, Traits::nr,
+                                  ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+                                OutputMapper, Traits::mr, Traits::nr,
+      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
+                                            RhsBlock* rhs_block) {
+    return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
+      Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
+      const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
+      std::vector<RhsBlock>* rhs_blocks) {
+    return BlockMemAllocator::allocateSlices(
+        d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    BlockMemAllocator::deallocate(d, handle);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
+      LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+        /*offset*/ 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
+      RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(*rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
+      const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
+      const RhsBlock& rhsBlock, const StorageIndex rows,
+      const StorageIndex depth, const StorageIndex cols,
+      const ResScalar alpha, const ResScalar beta) {
+    // Default GEBP kernel does not support beta.
+    eigen_assert(beta == ResScalar(1));
+    static const int kComputeStrideFromBlockDimensions = -1;
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+        /*strideA*/ kComputeStrideFromBlockDimensions,
+        /*strideB*/ kComputeStrideFromBlockDimensions,
+        /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+
+ private:
+  // These are dimensions of the original Tensors, and selected block sizes. The
+  // actual block sizes passed to all function above might be smaller because of
+  // the partial blocks at the end.
+  const StorageIndex m;
+  const StorageIndex k;
+  const StorageIndex n;
+  const StorageIndex bm;
+  const StorageIndex bk;
+  const StorageIndex bn;
+};
+
 }  // end namespace internal
 
-template<typename Indices, typename LhsXprType, typename RhsXprType>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+// Tensor contraction params that should enable to get from output matrix
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+  // TensorContraction evaluator assumes that both tensors are in ColMajor
+  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
+  bool swapped_arguments;
+};
+
+// Output kernel allows to fuse operations into the tensor contraction.
+//
+// Examples:
+//   1. Elementwise Relu transformation following Conv2D.
+//   2. AddBias to the Conv2D output channels dimension.
+//
+// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+  /**
+   * Tensor contraction evaluator calls this kernel after finishing each block
+   * of output matrix. Output blocks belong to the 2-dimensional output tensor.
+   *
+   * TensorContractionParams contains contraction dimensions information
+   * required to map output 2-d space into the expected output tensor space
+   * (potentially higher dimensional).
+   *
+   * \param[in] output_mapper Access to output tensor memory
+   * \param[in] params   Tensor contraction parameters
+   * \param[in] i        Index of a first row available through output_mapper
+   * \param[in] j        Index of a first column available through output_mapper
+   * \param[in] num_rows Number of available rows
+   * \param[in] num_cols Number of available columns
+   */
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams& params, Index i,
+      Index j, Index num_rows, Index num_cols) const {
+    EIGEN_UNUSED_VARIABLE(output_mapper);
+    EIGEN_UNUSED_VARIABLE(params);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(num_rows);
+    EIGEN_UNUSED_VARIABLE(num_cols);
+  }
+};
+
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
   typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
-                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
-      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
-      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
+      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
+      const OutputKernelType& output_kernel = OutputKernelType())
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
+        m_output_kernel(output_kernel) {}
 
   EIGEN_DEVICE_FUNC
   const Indices& indices() const { return m_indices; }
@@ -98,35 +350,48 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
   const typename internal::remove_all<typename RhsXprType::Nested>::type&
   rhsExpression() const { return m_rhs_xpr; }
 
+  EIGEN_DEVICE_FUNC
+  const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
     const Indices m_indices;
+    const OutputKernelType m_output_kernel;
 };
 
 
 template<typename Derived>
-struct TensorContractionEvaluatorBase
+struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 {
   typedef typename internal::traits<Derived>::Indices Indices;
   typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
   typedef typename internal::traits<Derived>::RightArgType RightArgType;
+  typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
   typedef typename internal::traits<Derived>::Device Device;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = false,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   // Most of the code is assuming that both input tensors are ColMajor. If the
   // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
@@ -136,6 +401,9 @@ struct TensorContractionEvaluatorBase
   typedef typename internal::conditional<
     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
   static const int LDims =
       internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
   static const int RDims =
@@ -149,16 +417,17 @@ struct TensorContractionEvaluatorBase
 
   typedef DSizes<Index, NumDims> Dimensions;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_STRONG_INLINE
   TensorContractionEvaluatorBase(const XprType& op, const Device& device)
-    : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+      : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.lhsExpression(), op.rhsExpression()), device),
-    m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
-                          op.rhsExpression(), op.lhsExpression()), device),
+        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+                           op.rhsExpression(), op.lhsExpression()), device),
         m_device(device),
+        m_output_kernel(op.outputKernel()),
         m_result(NULL) {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-			   static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
 
 
@@ -233,7 +502,7 @@ struct TensorContractionEvaluatorBase
     // dimensions and right non-contracting dimensions.
     m_lhs_inner_dim_contiguous = true;
     int dim_idx = 0;
-    unsigned int nocontract_idx = 0;
+    Index nocontract_idx = 0;
 
     for (int i = 0; i < LDims; i++) {
       // find if we are contracting on index i of left tensor
@@ -323,64 +592,144 @@ struct TensorContractionEvaluatorBase
         numext::swap(m_dimensions[i], m_dimensions[j]);
       }
     }
+
+    // A set of parameters that will allow output kernel to get from output
+    // tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(
+              m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
         }
-      }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN;            \
+      } else {                                                               \
+        (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN;           \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN;          \
+      } else {                                                               \
+        (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN;         \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+   static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)
+        ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
+                                                               std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous,
+                              rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                              Alignment>(buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                              rhs_inner_dim_reordered, Alignment>(buffer);
     }
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  void evalGemv(Scalar* buffer) const {
     const Index rows = m_i_size;
     const Index cols = m_k_size;
 
@@ -418,12 +767,41 @@ struct TensorContractionEvaluatorBase
     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
         rows, cols, lhs, rhs,
         buffer, resIncr, alpha);
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
+                    static_cast<Index>(0), static_cast<Index>(0), rows,
+                    static_cast<Index>(1));
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+  void evalGemm(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous,
+                                   rhs_inner_dim_contiguous,
+                                   rhs_inner_dim_reordered,
+                                   Alignment, true>(buffer, 0, k, 1);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+      bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                    rhs_inner_dim_reordered, Alignment,
+        /*use_output_kernel*/ false>(buffer, k_start, k_end,
+                                     num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
+    const Index k_slice = k_end - k_start;
 
     // rows in left side
     const Index m = this->m_i_size;
@@ -431,16 +809,9 @@ struct TensorContractionEvaluatorBase
     // columns in right side
     const Index n = this->m_j_size;
 
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    // define mr, nr, and all of my data mapper types
+    // define data mappers for Lhs and Rhs
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-    const Index nr = Traits::nr;
-    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -462,11 +833,9 @@ struct TensorContractionEvaluatorBase
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
-    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
-
-    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -478,42 +847,72 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
+                                        Index, internal::ShardByCol>
+        blocking(k_slice, m, n, num_threads);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
-    const Index sizeA = mc * kc;
-    const Index sizeB = kc * nc;
 
-    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
-    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+
+    LhsBlock blockA;
+    RhsBlock blockB;
+
+    TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
+
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+    const BlockMemHandle packed_mem =
+        kernel.allocate(this->m_device, &blockA, &blockB);
+
+    // If a contraction kernel does not support beta, explicitly initialize
+    // output buffer with zeroes.
+    if (!TensorContractionKernel::HasBeta) {
+      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    }
 
     for(Index i2=0; i2<m; i2+=mc)
     {
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
-      for (Index k2 = 0; k2 < k; k2 += kc) {
+      for (Index k2 = k_start; k2 < k_end; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
-        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
+        kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // If kernel supports beta, there is no need to initialize output
+        // buffer with zeroes.
+        const Scalar alpha = Scalar(1);
+        const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
+                                ? Scalar(0)
+                                : Scalar(1);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
+          kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
+                         actual_nc);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
-          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
+          const OutputMapper output_mapper = output.getSubMapper(i2, j2);
+          kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
+                        actual_nc, alpha, beta);
+
+          // We are done with this [i2, j2] output block.
+          if (use_output_kernel && k2 + kc >= k_end) {
+            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
+                            actual_mc, actual_nc);
+          }
         }
       }
     }
 
-    this->m_device.deallocate(blockA);
-    this->m_device.deallocate(blockB);
+    kernel.deallocate(this->m_device, packed_mem);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
 
@@ -536,11 +935,9 @@ struct TensorContractionEvaluatorBase
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
-  protected:
-  // Prevent assignment
-  TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
+protected:
   Dimensions m_dimensions;
 
   contract_t m_k_strides;
@@ -560,22 +957,25 @@ struct TensorContractionEvaluatorBase
   Index m_j_size;
   Index m_k_size;
 
+  TensorContractionParams m_tensor_contraction_params;
+
   TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device& m_device;
-  Scalar* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  OutputKernelType m_output_kernel;
+  EvaluatorPointerType m_result;
 };
 
 
 // evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
     public TensorContractionEvaluatorBase<
-      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -609,17 +1009,12 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   // Could we use NumDimensions here?
   typedef DSizes<Index, NumDims> Dimensions;
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+  TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) { }
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-
-    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  template <int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
   }
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 5cf7b4f71..974feb0ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,14 +21,28 @@ enum {
 
 
 // Default Blocking Strategy
-template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
 class TensorContractionBlocking {
  public:
 
-  typedef typename LhsMapper::Scalar LhsScalar;
-  typedef typename RhsMapper::Scalar RhsScalar;
+ /*
+   adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+     requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
+     which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+     (else HIPCC will error out)
 
-  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
+   However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+   results in NVCC erroring out with the following error
+
+   ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+      dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
+ */
+
+  #if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+  #endif
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -37,19 +51,22 @@ class TensorContractionBlocking {
     else {
       computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
     }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
 
  private:
-  Index kc_;
-  Index mc_;
-  Index nc_;
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
 };
 
-
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index d65dbb40f..3f315fedc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1,1391 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
+#endif
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
-                       const Index m_size, const Index n_size, const Index k_size) {
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  // declare and initialize 64 registers for output 8x8 block
-
-  // prefetch registers
-  Scalar lhs_pf0;
-  Scalar lhs_pf1;
-  Scalar lhs_pf2;
-  Scalar lhs_pf3;
-  Scalar lhs_pf4;
-  Scalar lhs_pf5;
-  Scalar lhs_pf6;
-  Scalar lhs_pf7;
-
-  Scalar rhs_pf0;
-  Scalar rhs_pf1;
-  Scalar rhs_pf2;
-  Scalar rhs_pf3;
-  Scalar rhs_pf4;
-  Scalar rhs_pf5;
-  Scalar rhs_pf6;
-  Scalar rhs_pf7;
-
-  // shared memory is formatted
-  // (contract idx in block, nocontract idx in block, block idx)
-  // where block idx is column major. This transposition limits the number of
-  // bank conflicts when reading the LHS. The core idea is that since the contracting
-  // index is shared by both sides, then the contracting index should be in threadIdx.x.
-
-  // On the LHS, we pad each row inside of each block with an extra element. This makes
-  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
-  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
-  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
-  // conflicts on writes and also none on reads.
-
-  // storage indices
-  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
-  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
-  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
-  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
-  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
-  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
-  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
-  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
-  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
-  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
-  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
-  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
-  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
-  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
-  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
-  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
-  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
-  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
-  // in the loading code, the following variables are important:
-  // threadIdx.x: the vertical position in an 8x8 block
-  // threadIdx.y: the vertical index of the 8x8 block in the grid
-  // threadIdx.z: the horizontal position in an 8x8 block
-  // k: the horizontal index of the 8x8 block in the grid
-  //
-  // The k parameter is implicit (it was the loop counter for a loop that went
-  // from 0 to <8, but now that loop is unrolled in the below code.
-
-  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
-  const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k)                           \
-  {                                                             \
-    lhs_pf0 = conv(0);                                          \
-    lhs_pf1 = conv(0);                                          \
-    lhs_pf2 = conv(0);                                          \
-    lhs_pf3 = conv(0);                                          \
-    lhs_pf4 = conv(0);                                          \
-    lhs_pf5 = conv(0);                                          \
-    lhs_pf6 = conv(0);                                          \
-    lhs_pf7 = conv(0);                                          \
-                                                                \
-    rhs_pf0 = conv(0);                                          \
-    rhs_pf1 = conv(0);                                          \
-    rhs_pf2 = conv(0);                                          \
-    rhs_pf3 = conv(0);                                          \
-    rhs_pf4 = conv(0);                                          \
-    rhs_pf5 = conv(0);                                          \
-    rhs_pf6 = conv(0);                                          \
-    rhs_pf7 = conv(0);                                          \
-                                                                \
-    if (!needs_edge_check || lhs_vert < m_size) {               \
-      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
-      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
-      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
-      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
-      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
-      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
-      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
-      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
-      } else if (lhs_horiz_6 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-      } else if (lhs_horiz_5 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-      } else if (lhs_horiz_4 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-      } else if (lhs_horiz_3 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-      } else if (lhs_horiz_2 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-      } else if (lhs_horiz_1 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-      } else if (lhs_horiz_0 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-                                                                \
-    const Index rhs_vert = base_k + load_idx_vert;              \
-    if (!needs_edge_check || rhs_vert < k_size) {               \
-      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
-      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
-      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
-      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
-      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
-      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
-      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
-      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (rhs_horiz_7 < n_size) {                               \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
-      } else if (rhs_horiz_6 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-      } else if (rhs_horiz_5 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-      } else if (rhs_horiz_4 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-      } else if (rhs_horiz_3 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-      } else if (rhs_horiz_2 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-      } else if (rhs_horiz_1 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-      } else if (rhs_horiz_0 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-  }                                                             \
-
-#define writeRegToShmem(_)                      \
-  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
-  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
-                                                \
-  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
-  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
-                                                \
-  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
-  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
-                                                \
-  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
-  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
-                                                \
-  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
-  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
-                                                \
-  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
-  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
-                                                \
-  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
-  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
-                                                \
-  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
-  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
-
-  // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i)                        \
-  Scalar res(i, 0) = conv(0);                   \
-  Scalar res(i, 1) = conv(0);                   \
-  Scalar res(i, 2) = conv(0);                   \
-  Scalar res(i, 3) = conv(0);                   \
-  Scalar res(i, 4) = conv(0);                   \
-  Scalar res(i, 5) = conv(0);                   \
-  Scalar res(i, 6) = conv(0);                   \
-  Scalar res(i, 7) = conv(0);                   \
-
-  internal::scalar_cast_op<int, Scalar> conv;
-  initResultRow(0);
-  initResultRow(1);
-  initResultRow(2);
-  initResultRow(3);
-  initResultRow(4);
-  initResultRow(5);
-  initResultRow(6);
-  initResultRow(7);
-#undef initResultRow
-
-  for (Index base_k = 0; base_k < k_size; base_k += 64) {
-    // wait for previous iteration to finish with shmem. Despite common sense,
-    // the code is a bit faster with this here then at bottom of loop
-    __syncthreads();
-
-    prefetchIntoRegisters(base_k);
-    writeRegToShmem();
-
-    #undef prefetchIntoRegisters
-    #undef writeRegToShmem
-
-    // wait for shared mem packing to be done before starting computation
-    __syncthreads();
-
-    // compute 8x8 matrix product by outer product. This involves packing one column
-    // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
-    Scalar lcol(0);
-    Scalar lcol(1);
-    Scalar lcol(2);
-    Scalar lcol(3);
-    Scalar lcol(4);
-    Scalar lcol(5);
-    Scalar lcol(6);
-    Scalar lcol(7);
-
-#define rrow(j) _rrow##j
-    Scalar rrow(0);
-    Scalar rrow(1);
-    Scalar rrow(2);
-    Scalar rrow(3);
-    Scalar rrow(4);
-    Scalar rrow(5);
-    Scalar rrow(6);
-    Scalar rrow(7);
-
-    // Now x corresponds to k, y to m, and z to n
-    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j)                          \
-    lcol(0) = lhs_element(0, j);               \
-    rrow(0) = rhs_element(i, 0);               \
-    lcol(1) = lhs_element(1, j);               \
-    rrow(1) = rhs_element(i, 1);               \
-    lcol(2) = lhs_element(2, j);               \
-    rrow(2) = rhs_element(i, 2);               \
-    lcol(3) = lhs_element(3, j);               \
-    rrow(3) = rhs_element(i, 3);               \
-    lcol(4) = lhs_element(4, j);               \
-    rrow(4) = rhs_element(i, 4);               \
-    lcol(5) = lhs_element(5, j);               \
-    rrow(5) = rhs_element(i, 5);               \
-    lcol(6) = lhs_element(6, j);               \
-    rrow(6) = rhs_element(i, 6);               \
-    lcol(7) = lhs_element(7, j);               \
-    rrow(7) = rhs_element(i, 7);               \
-
-#define computeCol(j)                           \
-    res(0, j) += lcol(0) * rrow(j);             \
-    res(1, j) += lcol(1) * rrow(j);             \
-    res(2, j) += lcol(2) * rrow(j);             \
-    res(3, j) += lcol(3) * rrow(j);             \
-    res(4, j) += lcol(4) * rrow(j);             \
-    res(5, j) += lcol(5) * rrow(j);             \
-    res(6, j) += lcol(6) * rrow(j);             \
-    res(7, j) += lcol(7) * rrow(j);             \
-
-#define computePass(i)                          \
-    loadData(i, i);                             \
-                                                \
-    computeCol(0);                              \
-    computeCol(1);                              \
-    computeCol(2);                              \
-    computeCol(3);                              \
-    computeCol(4);                              \
-    computeCol(5);                              \
-    computeCol(6);                              \
-    computeCol(7);                              \
-
-    computePass(0);
-    computePass(1);
-    computePass(2);
-    computePass(3);
-    computePass(4);
-    computePass(5);
-    computePass(6);
-    computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
-  } // end loop over k
-
-  // we've now iterated over all of the large (ie width 64) k blocks and
-  // accumulated results in registers. At this point thread (x, y, z) contains
-  // the sum across all big k blocks of the product of little k block of index (x, y)
-  // with block of index (y, z). To compute the final output, we need to reduce
-  // the 8 threads over y by summation.
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-
-#define reduceRow(i, mask)                      \
-  shuffleInc(i, 0, mask);                       \
-  shuffleInc(i, 1, mask);                       \
-  shuffleInc(i, 2, mask);                       \
-  shuffleInc(i, 3, mask);                       \
-  shuffleInc(i, 4, mask);                       \
-  shuffleInc(i, 5, mask);                       \
-  shuffleInc(i, 6, mask);                       \
-  shuffleInc(i, 7, mask);                       \
-
-#define reduceMatrix(mask)                      \
-  reduceRow(0, mask);                           \
-  reduceRow(1, mask);                           \
-  reduceRow(2, mask);                           \
-  reduceRow(3, mask);                           \
-  reduceRow(4, mask);                           \
-  reduceRow(5, mask);                           \
-  reduceRow(6, mask);                           \
-  reduceRow(7, mask);                           \
-
-  // actually perform the reduction, now each thread of index (_, y, z)
-  // contains the correct values in its registers that belong in the output
-  // block
-  reduceMatrix(1);
-  reduceMatrix(2);
-  reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
-  // now we need to copy the 64 values into main memory. We can't split work
-  // among threads because all variables are in registers. There's 2 ways
-  // to do this:
-  // (1) have 1 thread do 64 writes from registers into global memory
-  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
-  //     each do 8 writes into global memory. We can just overwrite the shared
-  //     memory from the problem we just solved.
-  // (2) is slightly faster than (1) due to less branching and more ILP
-
-  // TODO: won't yield much gain, but could just use currently unused shared mem
-  //       and then we won't have to sync
-  // wait for shared mem to be out of use
-  __syncthreads();
-
-#define writeResultShmem(i, j)                                          \
-  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i)                             \
-  writeResultShmem(i, 0);                       \
-  writeResultShmem(i, 1);                       \
-  writeResultShmem(i, 2);                       \
-  writeResultShmem(i, 3);                       \
-  writeResultShmem(i, 4);                       \
-  writeResultShmem(i, 5);                       \
-  writeResultShmem(i, 6);                       \
-  writeResultShmem(i, 7);                       \
-
-  if (threadIdx.x == 0) {
-    writeRow(0);
-    writeRow(1);
-    writeRow(2);
-    writeRow(3);
-    writeRow(4);
-    writeRow(5);
-    writeRow(6);
-    writeRow(7);
-  }
-#undef writeResultShmem
-#undef writeRow
-
-  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
-  if (threadIdx.x < max_i_write) {
-    if (max_j_write == 8) {
-      // TODO: can i trade bank conflicts for coalesced writes?
-      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
-      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
-      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
-      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
-      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
-      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
-      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
-      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
-    } else {
-#pragma unroll 7
-      for (int j = 0; j < max_j_write; j++) {
-        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
-        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
-      }
-    }
-  }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(512)
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ Scalar lhs_shmem[72 * 64];
-  __shared__ Scalar rhs_shmem[72 * 64];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size && base_n + 63 < n_size) {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  } else {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][16],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, rhs_pf0;
-
-  float4 results[4];
-  for (int i=0; i < 4; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-#define prefetch_lhs(reg, row, col)                   \
-    if (!CHECK_LHS_BOUNDARY) {                        \
-      if (col < k_size) {                             \
-        reg =lhs.loadPacket<Unaligned>(row, col);     \
-      }                                               \
-    } else {                                          \
-      if (col < k_size) {                             \
-        if (row + 3 < m_size) {                       \
-          reg =lhs.loadPacket<Unaligned>(row, col);   \
-        } else if (row + 2 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-          reg.z =lhs(row + 2, col);                   \
-        } else if (row + 1 < m_size) {                \
-          reg.x =lhs(row + 0, col);                   \
-          reg.y =lhs(row + 1, col);                   \
-        } else if (row  < m_size) {                   \
-          reg.x =lhs(row + 0, col);                   \
-        }                                             \
-      }                                               \
-    }                                                 \
-
-
-  Index lhs_vert = base_m+threadIdx.x*4;
-
-  for (Index k = 0; k < k_size; k += 16) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf0 = internal::pset1<float4>(0);
-
-    Index lhs_horiz = threadIdx.y+k;
-    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
-    Index rhs_vert = k+(threadIdx.x%4)*4;
-    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-      }
-    } else {
-      if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    float x1, x2 ;
-    // the following can be a bitwise operation..... some day.
-    if((threadIdx.x%8) < 4) {
-      x1 = rhs_pf0.y;
-      x2 = rhs_pf0.w;
-    } else {
-      x1 = rhs_pf0.x;
-      x2 = rhs_pf0.z;
-    }
-    x1 = __shfl_xor(x1, 4);
-    x2 = __shfl_xor(x2, 4);
-    if((threadIdx.x%8) < 4) {
-      rhs_pf0.y = x1;
-      rhs_pf0.w = x2;
-    } else {
-      rhs_pf0.x = x1;
-      rhs_pf0.z = x2;
-    }
-
-    // We have 64 features.
-    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
-    // ...
-    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
-    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
-    // ...
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // ...
-    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
-    // ...
-
-    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
-    results[0].x += fl1.x * fr1.x;\
-    results[0].y += fl1.y * fr1.x;\
-    results[0].z += fl2.x * fr1.x;\
-    results[0].w += fl2.y * fr1.x;\
-\
-    results[1].x += fl1.x * fr1.y;\
-    results[1].y += fl1.y * fr1.y;\
-    results[1].z += fl2.x * fr1.y;\
-    results[1].w += fl2.y * fr1.y;\
-\
-    results[2].x += fl1.x * fr2.x;\
-    results[2].y += fl1.y * fr2.x;\
-    results[2].z += fl2.x * fr2.x;\
-    results[2].w += fl2.y * fr2.x;\
-\
-    results[3].x += fl1.x * fr2.y;\
-    results[3].y += fl1.y * fr2.y;\
-    results[3].z += fl2.x * fr2.y;\
-    results[3].w += fl2.y * fr2.y;\
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 16; koff ++) {
-      // 32 x threads.
-      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
-      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
-      int start_feature = threadIdx.y * 4;
-      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
-      add_vals(fl1, fl2, fr1, fr2)
-    }
-    __syncthreads();
-  }
-
-#undef prefetch_lhs
-#undef add_vals
-
-  Index horiz_base = threadIdx.y*4+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 4; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    // CHECK LHS
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK RHS
-    /*
-    int ncols_rem = fminf(n_size- horiz_base, 4);
-    for (int i = 0; i < ncols_rem; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }*/
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-       }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ EIGEN_STRONG_INLINE void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][32],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-  typedef float Scalar;
-
-  // prefetch registers
-  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
-  float4 rhs_pf0, rhs_pf1;
-
-  float4 results[8];
-  for (int i=0; i < 8; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-
-  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
-  for (Index k = 0; k < k_size; k += 32) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    lhs_pf1 = internal::pset1<float4>(0);
-    lhs_pf2 = internal::pset1<float4>(0);
-    lhs_pf3 = internal::pset1<float4>(0);
-
-    rhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf1 = internal::pset1<float4>(0);
-
-     if (!CHECK_LHS_BOUNDARY) {
-      if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-      } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-      } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      }
-    } else {
-      // just CHECK_LHS_BOUNDARY
-      if (lhs_vert + 3 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-          lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 2 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 1 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-        }
-      }
-    }
-    __syncthreads();
-    Index rhs_vert = k+threadIdx.x*4;
-    Index rhs_horiz0 = threadIdx.y*2+base_n;
-    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-      }
-    } else {
-      if (rhs_horiz1 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-          rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
-        } else if (rhs_vert + 2 < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-        } else if (k+threadIdx.x*4 + 1 < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        } else if (k+threadIdx.x*4  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        }
-      } else if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    __syncthreads();
-    // Loaded. Do computation
-    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
-    // ..
-    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
-    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
-    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
-    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
-    // ..
-    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
-    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
-    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
-    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
-    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
-    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
-    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
-    // LHS.
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // ...
-    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
-      results[0].x += a_feat1.x * f1.x;\
-      results[1].x += a_feat1.x * f1.y;\
-      results[2].x += a_feat1.x * f2.x;\
-      results[3].x += a_feat1.x * f2.y;\
-      results[4].x += a_feat1.x * f3.x;\
-      results[5].x += a_feat1.x * f3.y;\
-      results[6].x += a_feat1.x * f4.x;\
-      results[7].x += a_feat1.x * f4.y;\
-\
-      results[0].y += a_feat1.y * f1.x;\
-      results[1].y += a_feat1.y * f1.y;\
-      results[2].y += a_feat1.y * f2.x;\
-      results[3].y += a_feat1.y * f2.y;\
-      results[4].y += a_feat1.y * f3.x;\
-      results[5].y += a_feat1.y * f3.y;\
-      results[6].y += a_feat1.y * f4.x;\
-      results[7].y += a_feat1.y * f4.y;\
-\
-      results[0].z += a_feat2.x * f1.x;\
-      results[1].z += a_feat2.x * f1.y;\
-      results[2].z += a_feat2.x * f2.x;\
-      results[3].z += a_feat2.x * f2.y;\
-      results[4].z += a_feat2.x * f3.x;\
-      results[5].z += a_feat2.x * f3.y;\
-      results[6].z += a_feat2.x * f4.x;\
-      results[7].z += a_feat2.x * f4.y;\
-\
-      results[0].w += a_feat2.y * f1.x;\
-      results[1].w += a_feat2.y * f1.y;\
-      results[2].w += a_feat2.y * f2.x;\
-      results[3].w += a_feat2.y * f2.y;\
-      results[4].w += a_feat2.y * f3.x;\
-      results[5].w += a_feat2.y * f3.y;\
-      results[6].w += a_feat2.y * f4.x;\
-      results[7].w += a_feat2.y * f4.y;\
-
-    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
-    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
-    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
-    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
-    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
-    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
-    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 32; koff ++) {
-      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
-      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
-      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
-      int start_feature = (threadIdx.y / 4) * 8;
-
-      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
-      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
-      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
-      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
-      add_vals(a3, a4, br1, br2, br3, br4)
-    }
-    __syncthreads();
-  } // end loop over k
-
-
-  __syncthreads();
-  Index horiz_base = (threadIdx.y/4)*8+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 8; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK BOUNDARY_B
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[64*32];
-  __shared__ float2 rhs_shmem[128*8];
-
-  typedef float2 LHS_MEM[64][32];
-  typedef float2 RHS_MEM[128][8];
-
-  typedef float2 LHS_MEM16x16[32][16];
-  typedef float2 RHS_MEM16x16[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 128 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  bool check_rhs = (base_n + 63) >= n_size;
-  bool check_lhs128 = (base_m + 127) >= m_size;
-
-  if (!check_rhs) {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-__launch_bounds__(256)
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[32][16];
-  __shared__ float2 rhs_shmem[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size) {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
-
-  typedef GpuDevice Device;
-
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-  };
-
-  // Most of the code is assuming that both input tensors are ColMajor. If the
-  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
-  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
-  // will pretend B is LHS and A is RHS.
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
-  static const int LDims =
-      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims =
-      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
-  static const int ContractDims = internal::array_size<Indices>::value;
-
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
-  typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, LDims - ContractDims> left_nocontract_t;
-  typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
-  static const int NumDims = LDims + RDims - 2 * ContractDims;
-
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  // typedefs needed in evalTo
-  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
-  typedef typename LeftEvaluator::Dimensions LeftDimensions;
-  typedef typename RightEvaluator::Dimensions RightDimensions;
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device) {}
-
-  // We need to redefine this method to make nvcc happy
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
-    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-    if (data) {
-      evalTo(data);
-      return false;
-    } else {
-      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
-      evalTo(this->m_result);
-      return true;
-    }
-  }
-
-  void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
-
-  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-    const Index m_blocks = (m + 63) / 64;
-    const Index n_blocks = (n + 63) / 64;
-    const dim3 num_blocks(m_blocks, n_blocks, 1);
-    const dim3 block_size(8, 8, 8);
-    LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-    }
-  };
-
-  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-      if (m < 768 || n < 768) {
-        const Index m_blocks = (m + 63) / 64;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(16, 16, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      } else {
-        const Index m_blocks = (m + 127) / 128;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(8, 32, 1);
-        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      }
-    }
-  };
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalTyped(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-    EIGEN_UNUSED_VARIABLE(k)
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, 4,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, 4,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
-    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
-  }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and __CUDACC__
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#include "TensorContractionGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 000000000..c81803827
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+                       const Index m_size, const Index n_size, const Index k_size) {
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the below code.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                           \
+  {                                                             \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
+                                                                \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
+                                                                \
+    if (!needs_edge_check || lhs_vert < m_size) {               \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
+      } else if (lhs_horiz_6 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+      } else if (lhs_horiz_5 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+      } else if (lhs_horiz_4 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+      } else if (lhs_horiz_3 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+      } else if (lhs_horiz_2 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+      } else if (lhs_horiz_1 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+      } else if (lhs_horiz_0 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+                                                                \
+    const Index rhs_vert = base_k + load_idx_vert;              \
+    if (!needs_edge_check || rhs_vert < k_size) {               \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (rhs_horiz_7 < n_size) {                               \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
+      } else if (rhs_horiz_6 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+      } else if (rhs_horiz_5 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+      } else if (rhs_horiz_4 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+      } else if (rhs_horiz_3 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+      } else if (rhs_horiz_2 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+      } else if (rhs_horiz_1 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+      } else if (rhs_horiz_0 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+  }                                                             \
+
+#define writeRegToShmem(_)                      \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
+                                                \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
+                                                \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
+                                                \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
+                                                \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
+                                                \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
+                                                \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
+                                                \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)                        \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here then at bottom of loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+    #undef prefetchIntoRegisters
+    #undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)                          \
+    lcol(0) = lhs_element(0, j);               \
+    rrow(0) = rhs_element(i, 0);               \
+    lcol(1) = lhs_element(1, j);               \
+    rrow(1) = rhs_element(i, 1);               \
+    lcol(2) = lhs_element(2, j);               \
+    rrow(2) = rhs_element(i, 2);               \
+    lcol(3) = lhs_element(3, j);               \
+    rrow(3) = rhs_element(i, 3);               \
+    lcol(4) = lhs_element(4, j);               \
+    rrow(4) = rhs_element(i, 4);               \
+    lcol(5) = lhs_element(5, j);               \
+    rrow(5) = rhs_element(i, 5);               \
+    lcol(6) = lhs_element(6, j);               \
+    rrow(6) = rhs_element(i, 6);               \
+    lcol(7) = lhs_element(7, j);               \
+    rrow(7) = rhs_element(i, 7);               \
+
+#define computeCol(j)                           \
+    res(0, j) += lcol(0) * rrow(j);             \
+    res(1, j) += lcol(1) * rrow(j);             \
+    res(2, j) += lcol(2) * rrow(j);             \
+    res(3, j) += lcol(3) * rrow(j);             \
+    res(4, j) += lcol(4) * rrow(j);             \
+    res(5, j) += lcol(5) * rrow(j);             \
+    res(6, j) += lcol(6) * rrow(j);             \
+    res(7, j) += lcol(7) * rrow(j);             \
+
+#define computePass(i)                          \
+    loadData(i, i);                             \
+                                                \
+    computeCol(0);                              \
+    computeCol(1);                              \
+    computeCol(2);                              \
+    computeCol(3);                              \
+    computeCol(4);                              \
+    computeCol(5);                              \
+    computeCol(6);                              \
+    computeCol(7);                              \
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  } // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask)                      \
+  shuffleInc(i, 0, mask);                       \
+  shuffleInc(i, 1, mask);                       \
+  shuffleInc(i, 2, mask);                       \
+  shuffleInc(i, 3, mask);                       \
+  shuffleInc(i, 4, mask);                       \
+  shuffleInc(i, 5, mask);                       \
+  shuffleInc(i, 6, mask);                       \
+  shuffleInc(i, 7, mask);                       \
+
+#define reduceMatrix(mask)                      \
+  reduceRow(0, mask);                           \
+  reduceRow(1, mask);                           \
+  reduceRow(2, mask);                           \
+  reduceRow(3, mask);                           \
+  reduceRow(4, mask);                           \
+  reduceRow(5, mask);                           \
+  reduceRow(6, mask);                           \
+  reduceRow(7, mask);                           \
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There's 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j)                                          \
+  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i)                             \
+  writeResultShmem(i, 0);                       \
+  writeResultShmem(i, 1);                       \
+  writeResultShmem(i, 2);                       \
+  writeResultShmem(i, 3);                       \
+  writeResultShmem(i, 4);                       \
+  writeResultShmem(i, 5);                       \
+  writeResultShmem(i, 6);                       \
+  writeResultShmem(i, 7);                       \
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][16],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i=0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+#define prefetch_lhs(reg, row, col)                            \
+    if (!CHECK_LHS_BOUNDARY) {                                 \
+      if (col < k_size) {                                      \
+        reg =lhs.template loadPacket<float4,Unaligned>(row, col);     \
+      }                                                        \
+    } else {                                                   \
+      if (col < k_size) {                                      \
+        if (row + 3 < m_size) {                                \
+          reg =lhs.template loadPacket<float4,Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+          reg.z =lhs(row + 2, col);                            \
+        } else if (row + 1 < m_size) {                         \
+          reg.x =lhs(row + 0, col);                            \
+          reg.y =lhs(row + 1, col);                            \
+        } else if (row  < m_size) {                            \
+          reg.x =lhs(row + 0, col);                            \
+        }                                                      \
+      }                                                        \
+    }							       \
+
+  Index lhs_vert = base_m+threadIdx.x*4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y+k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+    Index rhs_vert = k+(threadIdx.x%4)*4;
+    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2 ;
+    // the following can be a bitwise operation..... some day.
+    if((threadIdx.x%8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+    #else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+    #endif
+    if((threadIdx.x%8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+    results[0].x += fl1.x * fr1.x;\
+    results[0].y += fl1.y * fr1.x;\
+    results[0].z += fl2.x * fr1.x;\
+    results[0].w += fl2.y * fr1.x;\
+\
+    results[1].x += fl1.x * fr1.y;\
+    results[1].y += fl1.y * fr1.y;\
+    results[1].z += fl2.x * fr1.y;\
+    results[1].w += fl2.y * fr1.y;\
+\
+    results[2].x += fl1.x * fr2.x;\
+    results[2].y += fl1.y * fr2.x;\
+    results[2].z += fl2.x * fr2.x;\
+    results[2].w += fl2.y * fr2.x;\
+\
+    results[3].x += fl1.x * fr2.y;\
+    results[3].y += fl1.y * fr2.y;\
+    results[3].z += fl2.x * fr2.y;\
+    results[3].w += fl2.y * fr2.y;\
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 16; koff ++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y*4+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+       }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][32],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+
+  float4 results[8];
+  for (int i=0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+  for (Index k = 0; k < k_size; k += 32) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+
+     if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y/4+k+24) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      } else if ((threadIdx.y/4+k+16) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      } else if ((threadIdx.y/4+k+8) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      } else if ((threadIdx.y/4+k) < k_size) {
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      }
+    } else {
+      // just CHECK_LHS_BOUNDARY
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k+threadIdx.x*4;
+    Index rhs_horiz0 = threadIdx.y*2+base_n;
+    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k+threadIdx.x*4 + 1 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k+threadIdx.x*4  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded. Do computation
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS.
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+      results[0].x += a_feat1.x * f1.x;\
+      results[1].x += a_feat1.x * f1.y;\
+      results[2].x += a_feat1.x * f2.x;\
+      results[3].x += a_feat1.x * f2.y;\
+      results[4].x += a_feat1.x * f3.x;\
+      results[5].x += a_feat1.x * f3.y;\
+      results[6].x += a_feat1.x * f4.x;\
+      results[7].x += a_feat1.x * f4.y;\
+\
+      results[0].y += a_feat1.y * f1.x;\
+      results[1].y += a_feat1.y * f1.y;\
+      results[2].y += a_feat1.y * f2.x;\
+      results[3].y += a_feat1.y * f2.y;\
+      results[4].y += a_feat1.y * f3.x;\
+      results[5].y += a_feat1.y * f3.y;\
+      results[6].y += a_feat1.y * f4.x;\
+      results[7].y += a_feat1.y * f4.y;\
+\
+      results[0].z += a_feat2.x * f1.x;\
+      results[1].z += a_feat2.x * f1.y;\
+      results[2].z += a_feat2.x * f2.x;\
+      results[3].z += a_feat2.x * f2.y;\
+      results[4].z += a_feat2.x * f3.x;\
+      results[5].z += a_feat2.x * f3.y;\
+      results[6].z += a_feat2.x * f4.x;\
+      results[7].z += a_feat2.x * f4.y;\
+\
+      results[0].w += a_feat2.y * f1.x;\
+      results[1].w += a_feat2.y * f1.y;\
+      results[2].w += a_feat2.y * f2.x;\
+      results[3].w += a_feat2.y * f2.y;\
+      results[4].w += a_feat2.y * f3.x;\
+      results[5].w += a_feat2.y * f3.y;\
+      results[6].w += a_feat2.y * f4.x;\
+      results[7].w += a_feat2.y * f4.y;\
+
+    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 32; koff ++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
+      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  } // end loop over k
+
+  __syncthreads();
+  Index horiz_base = (threadIdx.y/4)*8+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK BOUNDARY_B
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64*32];
+  __shared__ float2 rhs_shmem[128*8];
+
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device)
+  {
+    EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+                          GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+  }
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+#if defined(EIGEN_USE_HIP)
+    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 9b2cb3ff6..9ab900b4a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,8 +22,19 @@ enum {
 /*
  * Implementation of the Eigen blas_data_mapper class for tensors.
  */
-
-template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
+/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
+/// is scalar * for CoeffLoader.
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
+struct CoeffLoader;
+
+template <typename Scalar, typename Index, int side, typename Tensor,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class BaseTensorContractionMapper;
+
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
+struct CoeffLoader {
   enum {
     DirectOffsets = false
   };
@@ -34,6 +45,12 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
     eigen_assert(false && "unsupported");
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    eigen_assert(false && "unsupported");
+    return NULL;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -42,12 +59,19 @@ template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
     return m_tensor.template packet<LoadMode>(index);
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
 
  private:
   const Tensor m_tensor;
 };
 
-template <typename Tensor> struct CoeffLoader<Tensor, true> {
+template <typename Tensor, template <class> class MakePointer_>
+struct CoeffLoader<Tensor, true, MakePointer_> {
   enum {
     DirectOffsets = true
   };
@@ -58,6 +82,11 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
     m_data += offset;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+  data() const {
+    return m_data;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -65,15 +94,23 @@ template <typename Tensor> struct CoeffLoader<Tensor, true> {
   {
     return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
   }
+
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+  #endif
  private:
   typedef typename Tensor::Scalar Scalar;
-  const Scalar* m_data;
+
+  typename MakePointer_<const Scalar>::Type m_data;
 };
 
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
-         int packet_size, bool inner_dim_contiguous, int Alignment>
+         int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
 class SimpleTensorContractionMapper {
   public:
   EIGEN_DEVICE_FUNC
@@ -89,7 +126,7 @@ class SimpleTensorContractionMapper {
       m_k_strides(k_strides) { }
 
   enum {
-    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
   };
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -113,8 +150,10 @@ class SimpleTensorContractionMapper {
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
     const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
     Index nocontract_val = left ? row : col;
     Index linidx = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
       const Index idx = nocontract_val / m_ij_strides[i];
       linidx += idx * m_nocontract_strides[i];
@@ -131,6 +170,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val = left ? col : row;
     if(array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx = contract_val / m_k_strides[i];
         linidx += idx * m_contract_strides[i];
@@ -151,9 +191,11 @@ class SimpleTensorContractionMapper {
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
     const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
     Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = nocontract_val[0] / m_ij_strides[i];
         const Index idx1 = nocontract_val[1] / m_ij_strides[i];
@@ -174,6 +216,7 @@ class SimpleTensorContractionMapper {
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
     if (array_size<contract_t>::value> 0) {
+      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = contract_val[0] / m_k_strides[i];
         const Index idx1 = contract_val[1] / m_k_strides[i];
@@ -205,24 +248,41 @@ class SimpleTensorContractionMapper {
     return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_tensor.bind(cgh);
+  }
+  #endif
+
+  const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
+    return m_tensor;
+  }
+
+  const nocontract_t& nocontract_strides() const {
+    return m_nocontract_strides;
+  }
+  const nocontract_t& ij_strides() const { return m_ij_strides; }
+  const contract_t& contract_strides() const { return m_contract_strides; }
+  const contract_t& k_strides() const { return m_k_strides; }
+
  protected:
-  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
+  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
   const nocontract_t m_nocontract_strides;
   const nocontract_t m_ij_strides;
   const contract_t m_contract_strides;
   const contract_t m_k_strides;
 };
 
-
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size, bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -232,12 +292,11 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-
-  template <int AlignmentType>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
     // whole method makes column major assumption
 
     // don't need to add offsets for now (because operator handles that)
@@ -252,7 +311,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
 
     const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
     const Index first = indexPair.first;
-    const Index last = indexPair.second;
+    const Index lastIdx = indexPair.second;
 
     // We can always do optimized packet reads from left hand side right now, because
     // the vertical matrix dimension on the left hand side is never contracting.
@@ -260,7 +319,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     // been shuffled first.
     if (Tensor::PacketAccess &&
         (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
-        (last - first) == (packet_size - 1)) {
+        (lastIdx - first) == (packet_size - 1)) {
 
       return this->m_tensor.template packet<AlignmentType>(first);
     }
@@ -268,31 +327,44 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     EIGEN_ALIGN_MAX Scalar data[packet_size];
 
     data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
     for (Index k = 1; k < packet_size - 1; k += 2) {
       const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
       data[k] = this->m_tensor.coeff(internal_pair.first);
       data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    data[packet_size - 1] = this->m_tensor.coeff(last);
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
 
-    return pload<Packet>(data);
+    return pload<PacketT>(data);
   }
 
-  template <int AlignmentType>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    // whole method makes column major assumption
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
+  load(Index i, Index j) const
+  {
+    const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
 
-    // don't need to add offsets for now (because operator handles that)
-    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
-    if (half_packet_size == packet_size) {
-      return loadPacket<AlignmentType>(i, j);
-    }
-    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
-    for (Index k = 0; k < half_packet_size; k++) {
-      data[k] = operator()(i + k, j);
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    return pload<HalfPacket>(data);
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT,int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load<PacketT,AlignmentType>(i,j);
   }
 };
 
@@ -301,11 +373,12 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
+         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+  : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -315,16 +388,17 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  typedef typename Tensor::PacketReturnType Packet;
-  template <int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
     EIGEN_ALIGN_MAX Scalar data[1];
     data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<typename Tensor::PacketReturnType>(data);
+    return pload<PacketT>(data);
   }
-  template <int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
-    return loadPacket(i, j);
+  template <typename PacketT,int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
   }
 };
 
@@ -333,14 +407,12 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
 class TensorContractionSubMapper {
  public:
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
 
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
   typedef Self LinearMapper;
 
   enum {
@@ -372,27 +444,32 @@ class TensorContractionSubMapper {
     return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, 0);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, j);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
+      return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
     }
-    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
     if (UseDirectOffsets) {
       m_base_mapper.storePacket(i, 0, p);
     }
@@ -408,19 +485,30 @@ class TensorContractionSubMapper {
 
   template <typename PacketT, int AlignmentType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
-    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
     if (UseDirectOffsets) {
-     return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+     return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
   }
 
-  template <typename Packet>
+  template <typename PacketT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
     return false;
   }
 
+  #ifdef EIGEN_USE_SYCL
+  // The placeholder accessors require to be bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_base_mapper.bind(cgh);
+  }
+  #endif
+
+  const ParentMapper& base_mapper() const { return m_base_mapper; }
+  Index vert_offset() const { return m_vert_offset; }
+  Index horiz_offset() const { return m_horiz_offset; }
+
  private:
   ParentMapper m_base_mapper;
   const Index m_vert_offset;
@@ -432,14 +520,14 @@ template<typename Scalar_, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,  template <class> class MakePointer_=MakePointer>
 class TensorContractionInputMapper
-  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
+  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
 
  public:
   typedef Scalar_ Scalar;
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
   typedef SubMapper VectorMapper;
 
   EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
@@ -457,9 +545,29 @@ class TensorContractionInputMapper
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
     return VectorMapper(*this, i, j);
   }
+  
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+    return Base::m_tensor;
+  }
 };
 
 
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+         typename Tensor_,
+         typename nocontract_t_, typename contract_t_,
+         int packet_size_,
+         bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_,  template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, 
+                                                    nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, 
+                                                    inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+      typedef Tensor_ XprType;
+      static const bool  inner_dim_contiguous = inner_dim_contiguous_;
+      static const bool  inner_dim_reordered = inner_dim_reordered_;
+  };  
+
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
new file mode 100755
index 000000000..473c22849
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -0,0 +1,1650 @@
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex  determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting element to be process by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+  // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+  // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
+  // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
+#ifndef EIGEN_SYCL_REG_M
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
+// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
+#ifndef EIGEN_SYCL_REG_N
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+  // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimM: determines the tile size for the m dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+  // TileSizeDimN: determines the tile size for the n dimension
+  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+  // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+  // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize
+  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+  // BC : determines if supporting bank conflict is required
+  static EIGEN_CONSTEXPR bool BC = true;
+  // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient  local memory)
+  static EIGEN_CONSTEXPR bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+      false;
+#else
+      true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+          typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+  vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+  contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+  when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+  return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
+    write(PacketType &packet_data, DataScalar ptr) {
+  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < PacketSize; i++) {
+    *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+    ptr += ld;
+  }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
+ * is used to guarantee coalesced and vectorized store whenever possible.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+    Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+  *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+  return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+  return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
+ * by each workgroup.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
+ * packetType; Otherwise it will be scalar Type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout  determines whether or not the Tensor data in a memory can be access coalesced and
+ * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ *
+ * \param c_stride  determines the stride of contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+  typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
+  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup.  Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
+ *
+ * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
+ * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
+ * resolve by compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
+ access is used to guarantee that always the memory access are coalesced.
+ *
+ * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute final output.
+
+ * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
+ the algorithm to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+          typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  static EIGEN_CONSTEXPR bool is_lhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+  static EIGEN_CONSTEXPR bool is_rhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+                          PacketReturnType>
+      LHSBlockProperties;
+
+  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+                          PacketReturnType>
+      RHSBlockProperties;
+
+  static EIGEN_CONSTEXPR StorageIndex NStride =
+      contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+  typedef
+      typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
+          tile_ptr;
+  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimM + Properties::BC
+                                                 : Properties::WorkLoadPerThreadM;
+  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
+                                                 ? Properties::TileSizeDimN + Properties::BC
+                                                 : Properties::WorkLoadPerThreadN;
+  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+  /**
+   * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
+   * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+   * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
+   * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
+   * different type of memory needed when local/no_local memory computation is called.
+   *
+   * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
+   of the algorithm to be used
+   * \tparam the private memory size
+   * \param ptr the tile memory pointer type
+   */
+  template <contraction_type, StorageIndex>
+  struct MemHolder {
+    tile_ptr ptr;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+  };
+  /**
+   * \brief specialization of memHolder class when no local memory kernel is used.
+   */
+  template <StorageIndex MemSize>
+  struct MemHolder<contraction_type::no_local, MemSize> {
+    OutScalar ptr[MemSize] = {OutScalar{0}};
+  };
+  /**
+   * \brief TiledMemory: contains required memory pointer for loading  each tile of the TensorContraction panel from
+   * global memory to local/private memory when local/no_local algorithm used.
+   *
+   * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param lhs_scratch_compute : determines the  location to load for computation for lhs_local memory. This is the
+   * same as lhs_scratch_extract for private memory.
+   *
+   * \param rhs_scratch_compute : determines the  location to load for computation for rhs_local memory. This is the
+   * same as rhs_scratch_extract for private memory.
+   */
+  struct TiledMemory {
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+    tile_ptr lhs_scratch_ptr_compute;
+    tile_ptr rhs_scratch_ptr_compute;
+    const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+    const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
+        : lhs_scratch_extract{},
+          rhs_scratch_extract{},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+          lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+          rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
+                typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
+        : lhs_scratch_extract{block_start_ptr},
+          rhs_scratch_extract{lhs_scratch_extract.ptr +
+                              ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+          lhs_extract_index(
+              local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+          rhs_extract_index(
+              local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+  };
+
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex groupSizeM;
+  const StorageIndex groupSizeN;
+  const StorageIndex numTiles;
+  const TripleDim triple_dim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex groupSizeN_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : scratch(scratch_),
+        lhs(lhs_),
+        rhs(rhs_),
+        out_res(out_res_),
+        groupSizeM(groupSizeM_),
+        groupSizeN(groupSizeN_),
+        numTiles(numTiles_),
+        triple_dim(triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+    const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+    const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+    const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+    const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+    const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+    const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+    const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+    const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+    const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+    const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+    const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+    StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+    const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                             triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                             triple_dim.K - kGroupOffset >= kSizePerWG;
+    // this is used to adjust the last block
+    StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+    // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
+    // tile
+    kGroupOffset += kSize;
+
+    auto thread_properties =
+        ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                                       mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+    (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+                                    : compute_panel<false>(itemID, thread_properties, out_ptr);
+  }
+  // The compute block computes the contraction operation private block for each thread and store the resutl in the
+  // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
+  // it only compute the block on each thread's private memory space
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                    PacketReturnType *privateRes) {
+    StorageIndex idx = 0;
+    EIGEN_CONSTEXPR StorageIndex lhs_stride =
+        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+      auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+      StorageIndex lhs_index = 0;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+        PacketReturnType lhsPack{};
+        Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+                                                                                             lhs_block_ptr + lhs_index);
+        privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+        lhs_index += lhs_stride;
+        idx++;
+      }
+    }
+  }
+  // The store function write the computed contraction operation in the private memory of each thread to the global
+  // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
+  // class.
+  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+    };
+    // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
+    // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
+    // WorkLoadPerThreadN slice of N
+    EIGEN_CONSTEXPR StorageIndex GlobalNStride =
+        contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+      // output leading dimension
+      StorageIndex outputLD = 0;
+      // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local
+      // memory and extracting from local to global is the same as no transposed version. However, when local memory is
+      // not used and RHS is transposed we packetize the load for RHS.
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+        StorageIndex globalRow = mGlobalOffset;
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+          PacketReturnType privetOut = privateRes[wLPTM];
+          if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+            // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
+            // StorageIndex Therefore it is always coalesced layout
+            write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+          } else {
+            EIGEN_UNROLL_LOOP
+            for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+              StorageIndex mOffset = globalRow + mId;
+              if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+                out_ptr[mOffset + outputLD] =
+                    Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+              }
+            }
+          }
+          globalRow += (PacketSize * Properties::LocalThreadSizeM);
+        }
+        outputLD += triple_dim.M;
+        privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+      }
+      out_ptr += (GlobalNStride * outputLD);
+
+      nGlobalOffset += (PrivateNStride * GlobalNStride);
+    }
+  }
+  // when no local memory is used the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
+      extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+        InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+    StorageIndex cIndex = cOffset;
+
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+      StorageIndex ncIndex = ncOffset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+        if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+          auto val =
+              read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                   InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                data_source::private_mem>(val, private_ptr);
+        } else {
+          EIGEN_UNROLL_LOOP
+          for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+            const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+            const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+            OutScalar val =
+                (ncInd < NC && cInd < triple_dim.K)
+                    ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                          inpt, ncInd, cInd, ld)
+                    : OutScalar(0);
+            write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                  data_source::private_mem>(
+                val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                         ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+          }
+        }
+
+        // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+        // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+        ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+                      ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                      : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+        private_ptr += InputBlockProperties::nc_stride;
+      }
+      // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+      private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+      cIndex += InputBlockProperties::c_stride;
+    }
+  }
+  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+      const StorageIndex &linearLocalThreadId) {
+    const StorageIndex localThreadNC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    const StorageIndex localThreadC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+    db_offset = !db_offset;
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
+      sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+
+  template <contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
+      sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
+    return;
+  }
+
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
+      sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+                      itemID
+#endif
+                  ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+    itemID.barrier(cl::sycl::access::fence_spacce::local_space);
+#else
+    return;
+#endif
+  }
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
+      sync_thread(const cl::sycl::nd_item<1> &itemID) {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+  template <bool need_sync>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
+      const cl::sycl::nd_item<1> &) {
+    return;
+  }
+
+  template <bool is_internal_block>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                    ThreadProperties<StorageIndex> &thread_properties,
+                                                                    TiledMemory &tiled_input_block,
+                                                                    PacketReturnType *privateRes, bool &db_offset) {
+    // Tiling the Rhs block from global to local memory
+    extract_block<RHSBlockProperties, is_internal_block>(
+        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+        tiled_input_block.rhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+    // Tiling the Lhs block from global to local memory
+    extract_block<LHSBlockProperties, is_internal_block>(
+        lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+        tiled_input_block.lhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    // itemID.barrier(cl::sycl::access::fence_space::local_space);
+    sync_thread<contraction_tp == contraction_type::local>(itemID);
+    // switch to compute mede
+    StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+    StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+    // Loop over the values of a single tile
+    for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+      compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+                             tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+      lhs_offset += LSDL;
+      rhs_offset += LSDR;
+    }
+    // computing the K index for the next tile
+    thread_properties.kSize -= Properties::TileSizeDimK;
+    sync_mem(itemID, db_offset);
+  }
+
+  // when local memory is available the following compute_panel will be enabled
+  template <bool is_internal_block, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+                                                           ThreadProperties<StorageIndex> &thread_properties,
+                                                           OutPtr out_ptr) {
+    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+    // Allocate register space
+    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+        PacketReturnType{0}};
+    bool db_offset = 0;
+
+    while (thread_properties.kSize >= Properties::TileSizeDimK) {
+      compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+    if (thread_properties.kSize > 0) {
+      compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+
+    // Storing the final results in the output
+    store<is_internal_block,
+          contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+        out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+        thread_properties.nGlobalOffset);
+  }
+  // When local memory is available the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
+      extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
+                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
+    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+    EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+        InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+    EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+    static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+                   (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+                  " LocalOffset must be divisable by stride");
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+    StorageIndex localThreadNC = local_index.first;
+    StorageIndex localThreadC = local_index.second;
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+      const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+      const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+      const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+      if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+        auto val =
+            read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+        write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+            val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                     (InputBlockProperties::c_stride * localThreadC * LSD));
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+          const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+          OutScalar val =
+              (nCInd < NC && cInd < triple_dim.K)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, nCInd, cInd, ld)
+                  : OutScalar(0);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+              val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                       (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                       ((InputBlockProperties::c_stride * localThreadC +
+                         (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+                        LSD));
+        }
+      }
+      localThreadNC += (InputBlockProperties::is_coalesced_layout)
+                           ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+                           : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+      localThreadC += (InputBlockProperties::is_coalesced_layout)
+                          ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+                          : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    }
+  }
+};
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
+ * case of Tensor Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
+ * Otherwise, the result of contraction will be written iin a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static EIGEN_CONSTEXPR int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
+  // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
+  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+      VecBlockProperties;
+
+  Scratch scratch;
+  const VectorMapper vec;
+  const TensorMapper mat;
+  OutAccessor out_res;
+  const StorageIndex nonContractGroupSize;
+  const StorageIndex nonContractDim;
+  const StorageIndex contractDim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+                                                            const TensorMapper mat_, OutAccessor out_res_,
+                                                            const StorageIndex nonContractGroupSize_,
+                                                            const StorageIndex nonContractDim_,
+                                                            const StorageIndex contractDim_)
+      : scratch(scratch_),
+        vec(vec_),
+        mat(mat_),
+        out_res(out_res_),
+        nonContractGroupSize(nonContractGroupSize_),
+        nonContractDim(nonContractDim_),
+        contractDim(contractDim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto scratch_ptr = scratch.get_pointer();
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+                                            : linearLocalThreadId % Properties::LocalThreadSizeNC;
+    StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+                                         : linearLocalThreadId / Properties::LocalThreadSizeNC;
+    const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+    const StorageIndex nonContractGroupId =
+        is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+    const StorageIndex contractGroupId =
+        is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+    const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+    const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+    auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+    const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+    auto local_output = scratch_ptr + OutScratchOffset;
+    const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+                             contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+    is_internal
+        ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                              scratch_ptr, contractGroupOffset,
+#endif
+                              nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                              nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+        : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                               scratch_ptr, contractGroupOffset,
+#endif
+                               nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                               nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+  }
+  template <bool is_internal_block, typename OutPtr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+      const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+      OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+      const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+      StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+      StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+    OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+    // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+    extract_block<VecBlockProperties, is_internal_block, KFactor,
+                  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+                                                                                vectorOffset, contractDim);
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+    StorageIndex privateOffsetC = 0;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+      StorageIndex privateOffsetNC = 0;
+      bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      auto vecScalar = *in_scratch_ptr;
+#else
+      auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+                           ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+                                 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+                           : OutScalar(0);
+#endif
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        auto matScalar = (check_boundary<is_internal_block>(
+                             contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+                             ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+                                              : globalNonContractDimOffset + privateOffsetNC,
+                                   is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+                                              : globalContractDimOffset + privateOffsetC)
+                             : OutScalar(0);
+
+        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        privateOffsetNC += Properties::LocalThreadSizeNC;
+      }
+      privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+    }
+
+    auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 element in shared memory should reduce to 16*1
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      *out_scratch_ptr = outScalar[j];
+
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+    if (is_lhs_vec) {
+      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    }
+
+    out_scratch_ptr = local_output + outScratchIndex;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (contractId < offset) {
+          StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
+        }
+      }
+      // moving to next 16 by 16 block
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+
+    if (contractId == 0) {
+      out_scratch_ptr = local_output + nonContractId;
+      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+      out_ptr += global_final_offset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+          auto res = *out_scratch_ptr;
+
+          *out_ptr = res;
+          out_ptr += Properties::LocalThreadSizeNC;
+        }
+        // moving to next 16 by 16 block to ge the next 16 reduced elements
+        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+      }
+    }
+  }
+
+  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+            typename Local>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+                                                                  const StorageIndex &linearLocalThreadId,
+                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
+    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+    StorageIndex cIndex = cOffset;
+    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+                                                                                              cIndex, StorageIndex(1));
+        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          OutScalar val =
+              (cIndex + i < C)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+                  : OutScalar(0);
+          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+        }
+      }
+      local_ptr += InputBlockProperties::c_stride * GroupSize;
+      cIndex += InputBlockProperties::c_stride * GroupSize;
+    }
+  }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determins the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex rng;
+
+  EIGEN_DEVICE_FUNC
+  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+                           const StorageIndex rng_)
+      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_res.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    StorageIndex globalid = itemID.get_global_id(0);
+    StorageIndex localid = itemID.get_local_id(0);
+    OutScalar accumulator = OutScalar(0);
+    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    }
+    auto out_scratch_ptr = scratch_ptr + localid;
+    *out_scratch_ptr = accumulator;
+    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+      }
+    }
+    if (localid == 0) {
+      out_ptr[itemID.get_group(0)] = accumulator;
+    }
+  }
+};
+#endif
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+                       Eigen::SyclDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
+  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+                "SYCL tensor contraction does not support output kernels.");
+
+  typedef Eigen::SyclDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index StorageIndex;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Base::Storage Storage;
+  typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+  struct TripleDim {
+    const StorageIndex M;
+    const StorageIndex N;
+    const StorageIndex K;
+    TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+  };
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+  };
+
+  static EIGEN_CONSTEXPR int LDims = Base::LDims;
+  static EIGEN_CONSTEXPR int RDims = Base::RDims;
+  static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
+
+  typedef array<StorageIndex, LDims> left_dim_mapper_t;
+  typedef array<StorageIndex, RDims> right_dim_mapper_t;
+
+  typedef array<StorageIndex, ContractDims> contract_t;
+  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+  typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+  typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
+  typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+  struct input_mapper_propertis {
+    static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+    static EIGEN_CONSTEXPR bool is_rhs_matrix =
+        (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+  };
+
+  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (!data) {
+      this->m_result = this->m_device.get(
+          static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+      data = this->m_result;
+    }
+    evalToSycl(data);
+    return (this->m_result != NULL);
+  }
+  const Eigen::SyclDevice &device() const { return this->m_device; }
+  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    } else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+    const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+        PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t,
+                                                   PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
+        RhsMapper;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+    if (triple_dim.M == 1 && triple_dim.N == 1) {
+      launchSC(buffer, lhs, rhs, triple_dim.K);
+    } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+        if (triple_dim.M != 1 && triple_dim.N == 1) {
+      LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+    } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+      LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+    } else  // This is equivalent of if (m!=1 && n!=1)
+#endif
+    {
+      typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+          inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+      bool skinny = false;
+      auto platform_name = this->device().getPlatformName();
+      // This is based on empirical calculation for AMD r9-nano and Fiji
+      if (platform_name.find("AMD") == 0) {
+        skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+                 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+                  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+      } else {
+        skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+                  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+                  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+      }
+      if (skinny)
+        adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+      else
+#endif  // EIGEN_SYCL_DISABLE_SKINNY
+        adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+    }
+  }
+
+  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    if (device().has_local_memory()) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+    if (!(device().has_local_memory())) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+  }
+
+  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+            typename Properties, typename LhsMapper, typename RhsMapper>
+  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                const TripleDim &triple_dim) const {
+    const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+    const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+    const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+    const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+    const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+    StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+    StorageIndex groupSizeK =
+        skinny
+            ? std::max(std::min(totalTilesK,
+                                (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+                                    (groupSizeM * groupSizeN)),
+                       StorageIndex(1))
+            : StorageIndex(1);
+
+    const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+    const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+    const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+    const StorageIndex globalRange = totalGroupSize * localRange;
+
+    const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+                                         ? ((Properties::DoubleBuffer + 1) *
+                                            (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+                                               ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+                                                (Properties::TileSizeDimN + Properties::BC))
+                                         : StorageIndex(1);
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (groupSizeK == 1) {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, true, ct>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
+    } else {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, false, ct>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+          triple_dim);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      auto op = Op();
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+                                    Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
+
+      device().deallocate_temp(temp_pointer);
+    }
+  }
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+                                    StorageIndex NC, StorageIndex C) const {
+    const StorageIndex nonContractDim = NC;
+    EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex CFactor = 1;
+    EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+    typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+        Properties;
+    const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+    const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+    const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+    const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+    const StorageIndex globalRange =
+        (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+    const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+    const StorageIndex scratchSize =
+        (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (cNumGroups > 1) {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, false>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+                                cl::sycl::range<1>(localRange)),
+          StorageIndex(1), Op(), nonContractDim, cNumGroups);
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, true>
+          ContractKernelName;
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+          vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+    }
+  }
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+  template <typename LhsMapper, typename RhsMapper>
+  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    StorageIndex K) const {
+    EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                          (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                        "The Local thread size must be a power of 2 for the reduction "
+                        "operation");
+    EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
+    // reduces at least 512 elementss individually, we get better performance.
+    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+    const StorageIndex global_range = num_work_group * local_range;
+
+    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+        ContractKernelName;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+                                                                                    thread_range, local_range, K);
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, StorageIndex, local_range>
+          GenericRKernel;
+      device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, buffer,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+      device().deallocate_temp(temp_pointer);
+    } else {
+      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+                                                                                    local_range, K);
+    }
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    this->m_leftImpl.cleanup();
+    this->m_rightImpl.cleanup();
+
+    if (this->m_result) {
+      this->m_device.deallocate_temp(this->m_result);
+      this->m_result = NULL;
+    }
+  }
+  // The placeholder accessors must bound to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    this->m_leftImpl.bind(cgh);
+    this->m_rightImpl.bind(cgh);
+    this->m_result.bind(cgh);
+  }
+};
+}  // namespace Eigen
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index ee16cde9b..21be6ea42 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,57 +15,16 @@
 
 namespace Eigen {
 
-#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
-namespace internal {
-
-template<typename LhsScalar, typename LhsMapper, typename Index>
-struct packLhsArg {
-  LhsScalar* blockA;
-  const LhsMapper& lhs;
-  const Index m_start;
-  const Index k_start;
-  const Index mc;
-  const Index kc;
-};
-
-template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
-struct packRhsAndKernelArg {
-  const MaxSizeVector<LhsScalar*>* blockAs;
-  RhsScalar* blockB;
-  const RhsMapper& rhs;
-  OutputMapper& output;
-  const Index m;
-  const Index k;
-  const Index n;
-  const Index mc;
-  const Index kc;
-  const Index nc;
-  const Index num_threads;
-  const Index num_blockAs;
-  const Index max_m;
-  const Index k_block_idx;
-  const Index m_block_idx;
-  const Index n_block_idx;
-  const Index m_blocks;
-  const Index n_blocks;
-  MaxSizeVector<Notification*>* kernel_notifications;
-  const MaxSizeVector<Notification*>* lhs_notifications;
-  const bool need_to_pack;
-};
-
-}  // end namespace internal
-#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
-
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
 
   typedef ThreadPoolDevice Device;
 
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -112,40 +71,35 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
+  template <int Alignment>
   void evalProduct(Scalar* buffer) const {
-    typedef
-        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
-            LhsScalar;
-    typedef
-        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
-            RhsScalar;
-    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-    typedef internal::TensorContractionInputMapper<
-        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
-        contract_t, internal::packet_traits<LhsScalar>::size,
-        lhs_inner_dim_contiguous, false, Unaligned>
-        LhsMapper;
-    typedef internal::TensorContractionInputMapper<
-        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
-        contract_t, internal::packet_traits<RhsScalar>::size,
-        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
-        RhsMapper;
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    typedef internal::gemm_pack_lhs<LhsScalar, Index,
-                                    typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
-        LhsPacker;
-    typedef internal::gemm_pack_rhs<
-        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
-        RhsPacker;
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false>
-        GebpKernel;
+    evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
+  }
+
+  template <typename EvalToCallback, int Alignment>
+  void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
+    evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
+  }
+
+  template <typename DoneCallback, int Alignment>
+  void evalProductImpl(Scalar* buffer, DoneCallback done) const {
+    // This function computes a lot of heuristics in multiple steps, and it
+    // also has multiple exit points. To keep it sane, readable and all in one
+    // place, sync/async execution decision is made at runtime at the very end.
+    //
+    // (1) In sync mode we allocate Context on the stack, submit computations
+    //     to the device thread pool, and block on a barrier until it is
+    //     completed.
+    //
+    // (2) In async mode we allocate Context on the heap, and after all tasks
+    //     are finished, we call provided the done callback, and delete a
+    //     context from the heap.
+    //
+    // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
+    // and temporary buffers, requried for executing the tensor contraction.
+    // They are responsible for cleaning it up after contraction is done.
+    static const bool IsEvalInSyncMode =
+        std::is_same<DoneCallback, NoCallback>::value;
 
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
@@ -181,14 +135,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@@ -204,35 +158,45 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
     int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
+    int num_threads_by_k = numThreadsInnerDim(m, n, k);
+    if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
+      // We are in the scenario where it is more effective to shard by the
+      // inner dimension.
+      if (IsEvalInSyncMode) {
+        EvalShardedByInnerDimContext<DoneCallback> ctx(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx.template run<Alignment>();
+      } else {
+        auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
+            this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx->template runAsync<Alignment>();
+      }
+
+      return;
+    }
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
     // model is not tuned. Remove this when the cost model is tuned.
     if (n == 1) num_threads = 1;
 
     if (num_threads == 1) {
-      // The single-threaded algorithm should be faster in this case.
-      if (n == 1)
-        this->template evalGemv<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
-      else
-        this->template evalGemm<lhs_inner_dim_contiguous,
-                                rhs_inner_dim_contiguous,
-                                rhs_inner_dim_reordered, Alignment>(buffer);
+      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+                                  Unaligned, (buffer));
+      if (!IsEvalInSyncMode) done();
       return;
     }
 
     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
@@ -264,6 +228,26 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index nm = divup(nm0, gm);
     Index nn = divup(nn0, gn);
 
+    // If there is enough concurrency in the sharding dimension, we choose not
+    // to paralellize by the other dimension, and execute all kernels in sync
+    // mode. This reduces parallelism from the nm x nn down to nn
+    // (shard_by_col==true) or nm (shard_by_col==false).
+    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+    const int num_worker_threads = this->m_device.numThreadsInPool();
+
+    // With small number of threads we want to make sure that we do not reduce
+    // parallelism too much. With large number of threads we trade maximum
+    // parallelism for better memory locality.
+    const float oversharding_factor =
+        num_worker_threads <= 4  ? 8.0 :
+        num_worker_threads <= 8  ? 4.0 :
+        num_worker_threads <= 16 ? 2.0 :
+        num_worker_threads <= 32 ? 1.0 :
+        num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
+
+    const bool parallelize_by_sharding_dim_only =
+        sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
     // Last by not least, decide whether we want to issue both lhs and rhs
     // packing in parallel; or issue lhs packing first, and then issue rhs
     // packing when lhs packing completes (for !shard_by_col lhs and rhs are
@@ -279,40 +263,139 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // But don't do it if we will use each rhs only once. Locality seems to be
     // more important in this case.
     if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+    // Also don't get in the way of parallelize_by_sharding_dim_only
+    // optimization.
+    if (parallelize_by_sharding_dim_only) parallel_pack = false;
+
+    // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
+    if (IsEvalInSyncMode) {
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   NoCallback())                                                            \
+      .run()
+      TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
+                                  CONTEXT_ARGS);
+#undef CONTEXT_ARGS
 
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
-                  this->m_i_strides, this->m_left_contracting_strides,
-                  this->m_k_strides);
+    } else {
+#define CONTEXT_ARGS                                                        \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
+   std::move(done))
+      TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
+                                        Alignment, CONTEXT_ARGS, run());
+#undef CONTEXT_ARGS
+    }
+  }
 
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
-                  this->m_j_strides, this->m_right_contracting_strides,
-                  this->m_k_strides);
+  // ------------------------------------------------------------------------ //
 
-    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
-            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
-                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
-                          shard_by_col, parallel_pack)
-        .run();
-  }
+  // Dummy struct to represent an empty DoneCallback.
+
+  struct NoCallback {
+    void operator()() {
+      eigen_assert(false && "NoCallback should never be called");
+    }
+  };
+
+  // ------------------------------------------------------------------------ //
 
-  // Context coordinates a single parallel gemm operation.
-  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
-            typename LhsMapper, typename RhsMapper, typename OutputMapper>
-  class Context {
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification;
+
+  // Synchronous evaluation notification that blocks caller thread in Wait().
+  template <typename Context>
+  class EvalParallelNotification<NoCallback, Context> {
+   public:
+    EvalParallelNotification(Context*, NoCallback) {}
+    void Notify() { done_.Notify(); }
+    void Wait() { done_.Wait(); }
+   private:
+    Eigen::Notification done_;
+  };
+
+  // Asynchronous evaluation notification that does not block in Wait().
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification {
+   public:
+    EvalParallelNotification(Context* ctx, DoneCallback done)
+        : ctx_(ctx), done_(std::move(done)) {}
+
+    void Notify() {
+      // Make a copy of done callback, because it will be destructed when we
+      // will delete context in the next line (EvalParallelNotification is a
+      // data member of EvalParallelContext class).
+      DoneCallback done_copy = std::move(done_);
+
+      // Delete parallel evaluation context.
+      delete ctx_;
+
+      // Now safely call the done callback.
+      done_copy();
+    }
+
+    void Wait() {}
+
+   private:
+    Context* ctx_;
+    DoneCallback done_;
+  };
+
+  // Context orchestrates sync/async parallel contraction evaluation. When it is
+  // executed in asynchronous mode, it owns all the shared state that might be
+  // accessible by block packing and kernel tasks.
+
+  template <typename DoneCallback, bool lhs_inner_dim_contiguous,
+            bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
+            int Alignment>
+  class EvalParallelContext {
    public:
-    Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
-            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
-            Index gn, Index nm0, Index nn0, bool shard_by_col,
-            bool parallel_pack)
-        : device_(device),
-          lhs_(lhs),
-          rhs_(rhs),
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    typedef internal::TensorContractionKernel<
+        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
+
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
+    EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
+                        Index tm, Index tn, Index tk, Index bm, Index bn,
+                        Index bk, Index nm, Index nn, Index nk, Index gm,
+                        Index gn, Index nm0, Index nn0, bool shard_by_col,
+                        bool parallel_pack,
+                        bool parallelize_by_sharding_dim_only,
+                        DoneCallback done)
+        : created_by_thread_id_(std::this_thread::get_id()),
+          done_(this, std::move(done)),
+          device_(self->m_device),
+          lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
+               self->m_i_strides, self->m_left_contracting_strides,
+               self->m_k_strides),
+          rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
+               self->m_j_strides, self->m_right_contracting_strides,
+               self->m_k_strides),
           buffer_(buffer),
           output_(buffer, tm),
+          output_kernel_(self->m_output_kernel),
+          tensor_contraction_params_(self->m_tensor_contraction_params),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
+          parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
           m_(tm),
           n_(tn),
           k_(tk),
@@ -325,13 +408,29 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           gm_(gm),
           gn_(gn),
           nm0_(nm0),
-          nn0_(nn0)
-  {
+          nn0_(nn0),
+          kernel_(m_, k_, n_, bm_, bk_, bn_),
+          num_thread_local_allocations_(0),
+          // We reserve 2X more capacity for a thread local values, than the
+          // number of threads in the pool to efficiently handle task stealing
+          // by threads that are not managed by the pool.
+          thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
+                                         ? device_.numThreadsInPool()
+                                         : 0)),
+          // We will use only one of the Lhs/Rhs thread local storage depending
+          // on the shard_by_col value and we parallelize by sharding dim ONLY.
+          lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
+                                   {*this}, {*this}),
+          rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
+                                   {*this}, {*this}) {
+      // These two options are mutually exclusive.
+      eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
       for (Index x = 0; x < P; x++) {
         // Normal number of notifications for k slice switch is
         // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
         // nm_ + nn_ notifications, because they will not receive notifications
-        // from preceeding kernels.
+        // from preceding kernels.
         state_switch_[x] =
             x == 0
                 ? 1
@@ -353,57 +452,97 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
 
       // Allocate memory for packed rhs/lhs matrices.
-      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
-      size_t lhs_size =
-          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
-      size_t rhs_size =
-          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
-      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
-          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
-      char* mem = static_cast<char*>(packed_mem_);
-      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
-        packed_lhs_[x].resize(nm0_);
-        for (Index m = 0; m < nm0_; m++) {
-          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
-          mem += lhs_size;
-        }
-        packed_rhs_[x].resize(nn0_);
-        for (Index n = 0; n < nn0_; n++) {
-          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
-          mem += rhs_size;
+      packed_mem_ = kernel_.allocateSlices(            //
+          device_,                                     //
+          /*num_lhs=*/nm0_,                            //
+          /*num_rhs=*/nn0_,                            //
+          /*num_slices=*/std::min<Index>(nk_, P - 1),  //
+          packed_lhs_, packed_rhs_);
+
+      if (parallelize_by_sharding_dim_only_) {
+        const int num_worker_threads = device_.numThreadsInPool();
+
+        if (shard_by_col) {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+          for (int i = 0; i < nn_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gn_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/0,                                        //
+              /*num_rhs=*/num_blocks,                               //
+              /*num_slices=*/1,                                     //
+              /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
+
+        } else {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+          for (int i = 0; i < nm_; ++i)
+            can_use_thread_local_packed_[i].store(true,
+                                                  std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gm_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/num_blocks,                               //
+              /*num_rhs=*/0,                                        //
+              /*num_slices=*/1, &lhs_thread_local_pre_allocated_,   //
+              /*rhs_blocks=*/nullptr);
         }
       }
     }
 
-    ~Context() {
+    ~EvalParallelContext() {
       for (Index x = 0; x < P; x++) {
         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
         delete[] state_kernel_[x];
       }
-      internal::aligned_free(packed_mem_);
+      kernel_.deallocate(device_, packed_mem_);
+      if (parallelize_by_sharding_dim_only_) {
+        kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
+        delete[] can_use_thread_local_packed_;
+      }
     }
 
     void run() {
       // Kick off packing of the first slice.
       signal_switch(0, 1);
+
       // Wait for overall completion.
-      // TODO(dvyukov): this wait can lead to deadlock.
-      // If nthreads contractions are concurrently submitted from worker
-      // threads, this wait will block all worker threads and the system will
-      // deadlock.
+      //
+      // If parallel evaluation is executed in async mode, this is a no-op, and
+      // Wait() will return immediately. In synchronous mode it will block the
+      // caller thread until it will receive notification from last task.
+      //
+      // In async mode, last task when completed will call done callback from
+      // the same thread, and will delete this context.
+      //
+      // TODO(dvyukov): This wait can lead to deadlock if contraction is
+      // evaluated in synchronous mode. If nthreads contractions are
+      // concurrently submitted from worker threads, this wait will block all
+      // worker threads and the system will deadlock.
       done_.Wait();
     }
 
    private:
-    Notification done_;
+    std::thread::id created_by_thread_id_;
+
+    // This notification is specialized on the type of DoneCallback and can be
+    // blocking or non-blocking.
+    EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
+
     const Device& device_;
-    LhsMapper& lhs_;
-    RhsMapper& rhs_;
+    LhsMapper lhs_;
+    RhsMapper rhs_;
     Scalar* const buffer_;
     OutputMapper output_;
+    OutputKernelType output_kernel_;
+    TensorContractionParams tensor_contraction_params_;
     const int num_threads_;
     const bool shard_by_col_;
     const bool parallel_pack_;
+    const bool parallelize_by_sharding_dim_only_;
     // Matrix sizes.
     const Index m_;
     const Index n_;
@@ -423,6 +562,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // coarsening).
     const Index nm0_;
     const Index nn0_;
+    // Tensor contraction kernel.
+    TensorContractionKernel kernel_;
 
     // Parallelization strategy.
     //
@@ -459,9 +600,215 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // actively executing + one to track completion of kernels in the second
     // slice.
     static const Index P = 3;
-    void* packed_mem_;
-    std::vector<LhsScalar*> packed_lhs_[P - 1];
-    std::vector<RhsScalar*> packed_rhs_[P - 1];
+
+    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+    BlockMemHandle packed_mem_;
+    std::vector<LhsBlock> packed_lhs_[P - 1];
+    std::vector<RhsBlock> packed_rhs_[P - 1];
+
+    // If we choose to parallelize only by the sharding dimension, each thread
+    // will have it's own "thead local" (not a c++ thread local storage) memory
+    // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
+    // can't be passed to a kernel that might execute on a different thread.
+    //
+    // In practice when we are ready to pack memory for the sharding dimension
+    // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
+    // already computed (99% of the time), and we can pack data into the thread
+    // local storage, and guarantee that all the kernels will be executed
+    // immediately in the same thread. This significantly increases L1 cache hit
+    // ratio and reduces pressure on the memory bus.
+    //
+    // It's still possible that kernel for the K-th slice will be ready before
+    // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+    // and packed_rhs_ to allow kernels to be executed later on a thread
+    // different from the thread that was used for packing.
+
+    // Handle for pre-allocated thread local memory buffers.
+    BlockMemHandle thread_local_pre_alocated_mem_;
+
+    // Only one of these will be initialized depending on shard_by_col value
+    // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+    std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+    std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+    // How many thread local blocks were already allocated.
+    std::atomic<int> num_thread_local_allocations_;
+    const int thread_local_capacity;
+
+    // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
+    // unique threads in a system is below or equal to the number of threads in
+    // a thread pool. We will fallback on dynamic memory allocation after that.
+
+    // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
+    // size is equal to the grain size in Lhs/Rhs sharding dimension.
+    template <typename BlockType>
+    class ThreadLocalBlocks {
+     public:
+      ThreadLocalBlocks() = default;
+
+      ThreadLocalBlocks(BlockType* base, size_t grain_size)
+          : is_pre_allocated_(true),
+            thread_local_pre_allocated_base_(base),
+            grain_size_(grain_size) {}
+
+      ThreadLocalBlocks(BlockMemHandle mem_handle,
+                        std::vector<BlockType> blocks)
+          : is_pre_allocated_(false),
+            mem_handle_(std::move(mem_handle)),
+            blocks_(std::move(blocks)) {}
+
+      BlockType& block(int grain_index) {
+        eigen_assert(grain_index >= 0);
+        eigen_assert(static_cast<size_t>(grain_index) < size());
+        return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
+                                 : blocks_[grain_index];
+      }
+
+      void Release(EvalParallelContext& ctx) const {
+        if (!is_pre_allocated_) {
+          ctx.kernel_.deallocate(ctx.device_, mem_handle_);
+        }
+      }
+
+      size_t size() const {
+        return is_pre_allocated_ ? grain_size_ : blocks_.size();
+      }
+
+     private:
+      bool is_pre_allocated_;
+
+      // Reuse pre-allocated thread local buffers.
+      BlockType* thread_local_pre_allocated_base_ = nullptr;
+      size_t grain_size_ = 0;
+
+      // These will be initialized only if `is_pre_allocated == false`.
+      BlockMemHandle mem_handle_{};
+      std::vector<BlockType> blocks_;
+    };
+
+    // ThreadLocalBlocksInitialize callable does custom thread local blocks
+    // initialization, and will reuse pre-allocated buffers if possible, or will
+    // dynamically allocate new memory.
+    //
+    // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
+    // for what side do we plan to do block allocation.
+    template <typename BlockType, bool is_rhs>
+    class ThreadLocalBlocksInitialize {
+      static constexpr bool kIsLhs =
+          !is_rhs && std::is_same<BlockType, LhsBlock>::value;
+      static const bool kIsRhs =
+          is_rhs && std::is_same<BlockType, RhsBlock>::value;
+      static_assert(kIsLhs || kIsRhs, "Unkown block type");
+
+      using Blocks = ThreadLocalBlocks<BlockType>;
+
+     public:
+      ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
+          : ctx_(ctx),
+            num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
+
+      void operator()(Blocks& blocks) {
+        const int n = ctx_.num_thread_local_allocations_.fetch_add(
+            1, std::memory_order_relaxed);
+
+        if (n >= num_worker_threads_) {
+          ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
+        } else {
+          ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
+        }
+      }
+
+     private:
+      // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
+      // TensorContractionKernel::allocateSlices into template specializations.
+      // Also explicit specializations are not allowed at class scope in C++03,
+      // EvalCtx type parameter is just a workaround for that limitation.
+      template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
+      struct ThreadLocalBlocksAllocator;
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<RhsBlock> rhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/0,
+              /*num_rhs=*/ctx.gn_,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+
+          blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
+                                               std::move(rhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
+          blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
+        }
+      };
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<LhsBlock> lhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+              ctx.device_,
+              /*num_lhs=*/ctx.gm_,
+              /*num_rhs=*/0,
+              /*num_slices=*/1,
+              /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+
+          blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
+                                               std::move(lhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
+          blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
+        }
+      };
+
+      EvalParallelContext& ctx_;
+      const int num_worker_threads_;
+    };
+
+    template <typename BlockType>
+    class ThreadLocalBlocksRelease {
+     public:
+      using Blocks = ThreadLocalBlocks<BlockType>;
+      ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
+      void operator()(Blocks& blocks) { blocks.Release(ctx_); }
+
+     private:
+      EvalParallelContext& ctx_;
+    };
+
+    // ThreadLocalBlocks initialization callables.
+    using ThreadLocalLhsInit =
+        ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
+    using ThreadLocalRhsInit =
+        ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
+
+    // ThreadLocalBlocks release callables.
+    using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
+    using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
+
+    // Thread local containers for Lhs/Rhs block packs. In practice only one of
+    // them will be used, depending on the shard_by_col value.
+    Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
+                       ThreadLocalLhsRelease>
+        lhs_thread_local_blocks_;
+    Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
+                       ThreadLocalRhsRelease>
+        rhs_thread_local_blocks_;
+
+    // After a particular shard for Kth slice missed thread local execution
+    // opportunity (K-1 slice didn't complete kernels execution), we can no
+    // longer schedule K+1 and following slices in thread local mode, because
+    // there is no more guarantee that previous kernels were executed
+    // sequentially in the same thread (size is nn_ or nm_).
+    std::atomic<bool>* can_use_thread_local_packed_;
+
     std::atomic<uint8_t>** state_kernel_[P];
     // state_switch_ is frequently modified by worker threads, while other
     // fields are read-only after constructor. Let's move it to a separate cache
@@ -470,69 +817,168 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     std::atomic<Index> state_packing_ready_[P];
     std::atomic<Index> state_switch_[P];
 
+    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(!shard_by_col_);
+        ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
+
+        Index grain_index = m1 - m * gm_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_lhs_[k % (P - 1)][m1];
+      }
+    }
+
+    RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(shard_by_col_);
+        ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
+
+        Index grain_index = n1 - n * gn_;
+        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_rhs_[k % (P - 1)][n1];
+      }
+    }
+
+    // In following two methods (pack_lhs and pack_rhs), if we know for sure
+    // that we'll be able to immediately call a kernel with packed data, and do
+    // not submit it to the thread pool, we can use thread local memory for
+    // packed data.
+    //
+    // We can only reliably check it if we are running all kernels in sync mode
+    // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
+    // run, it's guaranteed that all kernels with larger values of m (n) are
+    // also ready, because we execute them in the same order for all K slices.
+
     void pack_lhs(Index m, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+          can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in following slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[m].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
-                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
+                        lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
+        assert(!use_thread_local);
         signal_packing(k);
       } else {
         signal_switch(k + 1);
-        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
+        for (Index n = nn_ - 1; n >= 0; n--) {
+          bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       }
     }
 
     void pack_rhs(Index n, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+          can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in followig slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[n].store(false,
+                                                std::memory_order_relaxed);
+        }
+      }
+
       const Index nend = n * gn_ + gn(n);
       for (Index n1 = n * gn_; n1 < nend; n1++) {
-        if (k == 0) {
-          // Zero the output memory in parallel.
-          // On 10000x2x10000 mm zeroing can easily take half of time.
-          // Zero (bn x m) row. Safe to do here because all kernels that will
-          // write to this memory depend on completion of this task.
-          // Note: don't call device_.memset() here. device_.memset() blocks on
-          // thread pool worker thread, which can lead to underutilization and
-          // deadlocks.
+        if (!TensorContractionKernel::HasBeta && k == 0) {
+          // Zero the output memory in parallel, only if contraction kernel does
+          // not support `beta`. Otherwise we will pass beta 0.0 to the first
+          // call to the `TensorContractionKernel::invoke()`.
+          //
+          // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
+          // x m) row. Safe to do here because all kernels that will write to
+          // this memory depend on completion of this task. Note: don't call
+          // device_.memset() here. device_.memset() blocks on thread pool
+          // worker thread, which can lead to underutilization and deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
-                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
+                        rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
         signal_switch(k + 1);
-        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
+        for (Index m = nm_ - 1; m >= 0; m--) {
+          bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
       } else {
+        assert(!use_thread_local);
         signal_packing(k);
       }
     }
 
-    void kernel(Index m, Index n, Index k) {
+    void kernel(Index m, Index n, Index k, bool use_thread_local) {
       // Note: order of iteration matters here. Iteration over m is innermost
-      // because we want to reuse the same packed rhs in consequetive tasks
+      // because we want to reuse the same packed rhs in consecutive tasks
       // (rhs fits into L2$ while lhs only into L3$).
       const Index nend = n * gn_ + gn(n);
       const Index mend = m * gm_ + gm(m);
+
+      // NOTE: output = alpha * LHS * RHS + beta * output.
+      const Scalar alpha = Scalar(1);
+      const Scalar beta =
+          (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
+
       if (shard_by_col_) {
         for (Index n1 = n * gn_; n1 < nend; n1++) {
-          for (Index m1 = m * gm_; m1 < mend; m1++)
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+          for (Index m1 = m * gm_; m1 < mend; m1++) {
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
+          }
         }
       } else {
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
-            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
-                         packed_lhs_[k % (P - 1)][m1],
-                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
-                         Scalar(1), -1, -1, 0, 0);
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(
+                output_mapper,
+                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+                bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_,
+                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
           }
       }
-      signal_kernel(m, n, k + 1, false);
+      signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
       signal_switch(k + 2);
     }
 
@@ -545,16 +991,23 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       enqueue_packing(k, shard_by_col_);
     }
 
-    void signal_kernel(Index m, Index n, Index k, bool sync) {
+    void signal_kernel(Index m, Index n, Index k, bool sync,
+                       bool use_thread_local) {
       std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
       Index s = state->load();
       eigen_assert(s > 0);
-      if (s != 1 && state->fetch_sub(1) != 1) return;
+      if (s != 1 && state->fetch_sub(1) != 1) {
+        eigen_assert(!use_thread_local);
+        return;
+      }
       state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
-      if (sync)
-        kernel(m, n, k);
-      else
-        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
+      if (sync) {
+        kernel(m, n, k, use_thread_local);
+      } else {
+        eigen_assert(!use_thread_local);
+        device_.enqueueNoNotification(
+            [=]() { kernel(m, n, k, use_thread_local); });
+      }
     }
 
     void signal_switch(Index k, Index v = 1) {
@@ -604,11 +1057,32 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         else
           pack_lhs(start, k);
       } else {
-        Index mid = (start + end) / 2;
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
-        device_.enqueueNoNotification(
-            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
+        while (end - start > 1) {
+          Index mid = (start + end) / 2;
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+          end = mid;
+        }
+
+        // Decide if we want to run first packing task (start == 0) in
+        // async mode if we parallelize only by sharding dim:
+        // (1) pack_lhs and pack_rhs call signal_switch before completing
+        //     all calls to signal_kernel, which in sync mode might lead
+        //     to the execution of the first kernel of the k+1 slice, before
+        //     completing a call to the last kernel of the k slice.
+        // (2) all pack tasks for sharded dim must be executed in a thread
+        //     pool to get pre-allocated thead local buffers.
+        bool pack_async =
+          (start == 0) &&
+          (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) &&
+          (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
+
+        if (pack_async) {
+          device_.enqueueNoNotification(
+              [=]() { enqueue_packing_helper(start, end, k, rhs); });
+        } else {
+          enqueue_packing_helper(start, end, k, rhs);
+        }
       }
     }
 
@@ -620,10 +1094,364 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
     Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
 
-    Context(const Context&) = delete;
-    void operator=(const Context&) = delete;
+    EvalParallelContext(const EvalParallelContext&) = delete;
+    void operator=(const EvalParallelContext&) = delete;
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  using SyncEvalParallelContext =
+      EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
+                          rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                          Alignment>;
+
+  // ------------------------------------------------------------------------ //
+
+  // EvalShardedByInnerDimContext orchestrates sync/async contraction
+  // evaluation, when we shard by inner dimension. When it is executed in
+  // asynchronous mode, it owns all the shared state that might be accessible by
+  // block processing tasks.
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads,
+                                 Scalar* result_buffer,
+                                 Index m_size, Index n_size, Index k_size,
+                                 DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(divup<Index>(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(divup<Index>(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0
+                          ? result
+                          : static_cast<Scalar*>(evaluator->m_device.allocate(
+                                buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into the buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluate in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For the large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) into small ranges, and when a task for the
+    // block finishes its partial gemm computation, it checks if it was the last
+    // gemm in the range, and if so, it will add all blocks of the range.
+    //
+    // After all tasks done, we need to add only these pre-aggregated blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
+    static const Index l0_size = 4;
+    Index l0_ranges;
+
+    // Keep count of pending gemm tasks for each l0 range.
+    MaxSizeVector<std::atomic<int>> l0_state;  // [0, l0_ranges)
+
+    // Buffers allocated for each temporary block computation.
+    MaxSizeVector<Scalar*> block_buffers;  // [0, num_blocks)
+
+    template <int Alignment>
+    void processBlock(Index block_idx, Index begin, Index end) {
+      Scalar* buf = block_buffers[block_idx];
+
+      TENSOR_CONTRACTION_DISPATCH(
+          evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
+          (buf, begin, end,
+           /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+      // Check if it was the last task in l0 range.
+      const Index l0_index = block_idx / l0_size;
+      const int v = l0_state[l0_index].fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      // If we processed the last block of the range, we can aggregate all
+      // partial results into the first block of the range.
+      if (v == 1) {
+        const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
+        const Index dst_block_idx = l0_index * l0_size;
+
+        if (rng_size == l0_size) {
+          addAllToBuffer<Alignment>(
+              m * n,
+              /*src_buf0=*/block_buffers[dst_block_idx + 1],
+              /*src_buf1=*/block_buffers[dst_block_idx + 2],
+              /*src_buf2=*/block_buffers[dst_block_idx + 3],
+              /*dst_buf= */ block_buffers[dst_block_idx]);
+        } else {
+          // Aggregate blocks of potentially incomplete last range.
+          for (int i = 1; i < rng_size; ++i) {
+            addToBuffer<Alignment>(m * n,
+                                   /*src_buf=*/block_buffers[dst_block_idx + i],
+                                   /*dst_buf=*/block_buffers[dst_block_idx]);
+          }
+        }
+      }
+    }
+
+    // Aggregate partial sums from l0 ranges.
+    template <int Alignment>
+    void aggregateL0Blocks() const {
+      Index l0_index = 1;
+
+      for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+        addAllToBuffer<Alignment>(
+            m * n,
+            /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+            /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+            /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+            /*dst_buf= */ block_buffers[0]);
+      }
+
+      for (; l0_index < l0_ranges; ++l0_index) {
+        addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
+                               block_buffers[0]);
+      }
+    }
+
+    void applyOutputKernel() const {
+      typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+      evaluator->m_output_kernel(
+          OutputMapper(result, m), evaluator->m_tensor_contraction_params,
+          static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
+    }
+
+    // Compute block size with accounting for potentially incomplete last block.
+    Index actualBlockSize(Index block_idx) const {
+      return block_idx + 1 < num_blocks
+                 ? block_size
+                 : k + block_size - block_size * num_blocks;
+    };
+
+    // Compute range size with accounting for potentially incomplete last range.
+    Index actualRangeSize(Index num_ranges, Index range_size,
+                          Index range_idx) const {
+      eigen_assert(range_idx < num_ranges);
+      return range_idx + 1 < num_ranges
+                 ? range_size
+                 : num_blocks + range_size - range_size * num_ranges;
+    };
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
+                                                Scalar* tgt_buf) {
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const PacketReturnType src_val =
+            internal::pload<PacketReturnType>(src_buf + i);
+        const PacketReturnType tgt_val =
+            internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
+        const PacketReturnType sum = internal::padd(src_val, tgt_val);
+        internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
+                                                               sum);
+      }
+      for (; i < n; ++i) {
+        tgt_buf[i] += src_buf[i];
+      }
+    }
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
+                                                   const Scalar* src_buf0,
+                                                   const Scalar* src_buf1,
+                                                   const Scalar* src_buf2,
+                                                   Scalar* dst_buf) {
+      using ::Eigen::internal::padd;
+      using ::Eigen::internal::pload;
+      using ::Eigen::internal::ploadt;
+      using ::Eigen::internal::pstoret;
+
+      const int output_packet_size =
+          internal::unpacket_traits<PacketReturnType>::size;
+
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+        const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+        const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+        const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+        const auto sum =
+            padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+        pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+      }
+      for (; i < n; ++i) {
+        dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+      }
+    }
+
+    template <int Alignment>
+    void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, &barrier, mid_block_idx, end_block_idx]() {
+              eval<Alignment>(barrier, mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+      barrier.Notify();
+    }
+
+    template <int Alignment>
+    void evalAsync(Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueueNoNotification(
+            [this, mid_block_idx, end_block_idx]() {
+              evalAsync<Alignment>(mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+
+      int v = num_pending_blocks.fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      if (v == 1) {
+        // Aggregate partial sums from l0 ranges.
+        aggregateL0Blocks<Alignment>();
+
+        // Apply output kernel.
+        applyOutputKernel();
+
+        // NOTE: If we call `done` callback before deleting this (context),
+        // it might deallocate Self* pointer captured by context, and we'll
+        // fail in destructor trying to deallocate temporary buffers.
+
+        // Move done call back from context before it will be destructed.
+        DoneCallback done_copy = std::move(done);
+
+        // We are confident that we are the last one who touches context.
+        delete this;
+
+        // Now safely call the done callback.
+        done_copy();
+      }
+    }
+
+    // Cost model doesn't capture well the cost associated with constructing
+    // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+    // and gemm_pack_rhs, so we specify minimum desired block size.
+    static Index blockSize(Index k, int num_threads) {
+      const auto round_up = [=](Index index) -> Index {
+        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+        return divup<Index>(index, kmultiple) * kmultiple;
+      };
+
+      const Index target_block_size = round_up(divup<Index>(k, num_threads));
+      const Index desired_min_block_size = 12 * packet_size;
+
+      return numext::mini<Index>(
+          k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+    }
+
+    EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+    void operator=(const EvalShardedByInnerDimContext&) = delete;
   };
 
+  // ------------------------------------------------------------------------ //
+
+  // Below are the function used by evalProductImpl heuristics, trying to select
+  // optimcal parameters for parallelization algorithm.
+
   // Decide whether we want to shard m x n contraction by columns or by rows.
   static bool shardByCol(Index m, Index n, Index num_threads) {
     // Note: we are comparing both n and m against Traits::nr, it is not
@@ -727,304 +1555,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
-#else  // EIGEN_USE_SIMPLE_THREAD_POOL
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalProduct(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-      return;
-    }
-
-    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalGemm(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-
-    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
-    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, lhs_packet_size,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, rhs_packet_size,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-    // TODO: packing could be faster sometimes if we supported row major tensor mappers
-    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor> LhsPacker;
-    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
-
-    // TODO: replace false, false with conjugate values?
-    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
-                                  Traits::mr, Traits::nr, false, false> GebpKernel;
-
-    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
-    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-    // compute block sizes (which depend on number of threads)
-    const Index num_threads = this->m_device.numThreads();
-    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
-    Index mc = blocking.mc();
-    Index nc = blocking.nc();
-    Index kc = blocking.kc();
-    eigen_assert(mc <= m);
-    eigen_assert(nc <= n);
-    eigen_assert(kc <= k);
-
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-    const Index k_blocks = CEIL_DIV(k, kc);
-    const Index n_blocks = CEIL_DIV(n, nc);
-    const Index m_blocks = CEIL_DIV(m, mc);
-    const Index sizeA = mc * kc;
-    const Index sizeB = kc * nc;
-
-    /*    cout << "m: " << m << " n: " << n << " k: " << k << endl;
-    cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
-    cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
-    cout << "num threads: " << num_threads << endl;
-    */
-
-    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
-    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
-    // note: You can get away with allocating just a single blockA and offsets and meet the
-    //       the alignment requirements with the assumption that
-    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
-    const Index numBlockAs = numext::mini(num_threads, m_blocks);
-    MaxSizeVector<LhsScalar *> blockAs(num_threads);
-    for (int i = 0; i < num_threads; i++) {
-      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
-    }
-
-    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
-    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
-    //       Other options: (1) reuse memory when a thread finishes. con: tricky
-    //                      (2) allocate block B memory in each thread. con: overhead
-    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
-    for (int i = 0; i < n_blocks; i++) {
-      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
-    }
-
-    // lhs_notifications starts with all null Notifications
-    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
-
-    // this should really be numBlockAs * n_blocks;
-    const Index num_kernel_notifications = num_threads * n_blocks;
-    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
-                                                    nullptr);
-
-    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
-      const Index k_start = k_block_idx * kc;
-      // make sure we don't overshoot right edge of left matrix
-      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
-
-      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
-        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
-
-        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
-          const Index m_start = mt_block_idx * mc;
-          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
-          eigen_assert(actual_mc > 0);
-
-          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
-
-          for (int i = 0; i < n_blocks; ++i) {
-            Index notification_id = (blockAId * n_blocks + i);
-            // Wait for any current kernels using this slot to complete
-            // before using it.
-            if (kernel_notifications[notification_id]) {
-              wait_until_ready(kernel_notifications[notification_id]);
-              delete kernel_notifications[notification_id];
-            }
-            kernel_notifications[notification_id] = new Notification();
-          }
-          const packLArg arg = {
-            blockAs[blockAId], // blockA
-            lhs,        // lhs
-            m_start,    // m
-            k_start,    // k
-            actual_mc,  // mc
-            actual_kc,  // kc
-          };
-
-          // Delete any existing notification since we may be
-          // replacing it.  The algorithm should ensure that there are
-          // no existing waiters on this notification.
-          delete lhs_notifications[blockAId];
-          lhs_notifications[blockAId] =
-          this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
-        }
-
-        // now start kernels.
-        const Index m_base_start = m_block_idx * mc;
-        const bool need_to_pack = m_block_idx == 0;
-
-        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
-          const Index n_start = n_block_idx * nc;
-          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
-
-          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
-          // we're going to start new k. In both cases need_to_pack is true.
-          if (need_to_pack) {
-            for (Index i = num_blocks; i < num_threads; ++i) {
-              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
-              Index future_id = (blockAId * n_blocks + n_block_idx);
-              wait_until_ready(kernel_notifications[future_id]);
-            }
-          }
-
-          packRKArg arg = {
-            &blockAs, // blockA
-            blockBs[n_block_idx], // blockB
-            rhs,          // rhs
-            output,       // output
-            m_base_start, // m
-            k_start,      // k
-            n_start,      // n
-            mc,           // mc
-            actual_kc,    // kc
-            actual_nc,    // nc
-            num_threads,
-            numBlockAs,
-            m,
-            k_block_idx,
-            m_block_idx,
-            n_block_idx, // n_block_idx
-            m_blocks, // m_blocks
-            n_blocks, // n_blocks
-            &kernel_notifications, // kernel notifications
-            &lhs_notifications,    // lhs notifications
-            need_to_pack, // need_to_pack
-          };
-
-          // We asynchronously kick off this function, which ends up
-          // notifying the appropriate kernel_notifications objects,
-          // which this thread waits on before exiting.
-          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
-        }
-      }
-    }
-
-    // Make sure all the kernels are done.
-    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
-      wait_until_ready(kernel_notifications[i]);
-      delete kernel_notifications[i];
-    }
-
-    // No need to wait for lhs notifications since they should have
-    // already been waited on.  Just clean them up.
-    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
-      delete lhs_notifications[i];
-    }
-
-    // deallocate all of the memory for both A and B's
-    for (size_t i = 0; i < blockAs.size(); i++) {
-      this->m_device.deallocate(blockAs[i]);
-    }
-    for (size_t i = 0; i < blockBs.size(); i++) {
-      this->m_device.deallocate(blockBs[i]);
-    }
-
-#undef CEIL_DIV
-  }
-
-  /*
-   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
-   * the LHS block, check that all of the kernels that worked on the same
-   * mt_block_idx in the previous m_block are done.
-   */
-  template <typename packLArg, typename LhsPacker>
-  static void packLhs(const packLArg arg) {
-    // perform actual packing
-    LhsPacker pack_lhs;
-    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
-  }
-
-  /*
-   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
-   * all kernels in the previous block are done.
-   * Then for each LHS future, we wait on the future and then call GEBP
-   * on the area packed by the future (which starts at
-   * blockA + future_idx * mt * kc) on the LHS and with the full packed
-   * RHS block.
-   * The output of this GEBP is written to output(m + i * mt, n).
-   */
-  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
-  static void packRhsAndKernel(packRKArg arg) {
-    if (arg.need_to_pack) {
-      RhsPacker pack_rhs;
-      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
-    }
-
-    GebpKernel gebp;
-    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
-      const Index m_base_start = arg.m + arg.mc*mt_block_idx;
-      if (m_base_start < arg.max_m) {
-        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
-        wait_until_ready((*arg.lhs_notifications)[blockAId]);
-        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
-        gebp(arg.output.getSubMapper(m_base_start, arg.n),
-             (*arg.blockAs)[blockAId], arg.blockB,
-             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
-
-        // Notify that the kernel is done.
-        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
-        (*arg.kernel_notifications)[set_idx]->Notify();
-      }
-    }
-  }
-#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
-
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
     const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
                                           PacketType<RhsScalar, Device>::size);
     const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
     const double kd = static_cast<double>(bk);
-    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
-    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
-    // experimentally.
-    double computeBandwidth = bk == 1 ? 4.0 :
-          (shard_by_col ? bn : bm) < Traits::nr ||
-          (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
-#ifndef EIGEN_VECTORIZE_FMA
-    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
-    // However for MULPS/ADDPS we have dependent sequence of 2 such instructions,
-    // so overall bandwidth is 1.0.
-    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
-#endif
+    double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
     // Computations.
-    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
+    TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
     // Output stores.
     cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
     if (prepacked) {
@@ -1044,6 +1583,94 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       rhsCost.dropMemoryCost();
     return cost + lhsCost + rhsCost;
   }
+
+  // Decide whether we want to shard m x k x n contraction over the inner
+  // (contraction) dimension (k).
+  static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
+                              int num_threads_by_k) {
+    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
+    bool shard_by_k = false;
+    if (n == 1 ||                // If mat*vec or...
+        num_threads_by_k < 2 ||  // running single threaded or...
+        num_threads_by_k <
+            num_threads ||  // sharding by k gives less parallelism or...
+        bufsize > l3CacheSize() / num_threads_by_k ||  // need more buffer space
+        // than L3 cache or...
+        k / num_threads_by_k < 2 * Traits::nr) {  // k per thread is tiny.
+      shard_by_k = false;
+    } else if (numext::maxi(m, n) / num_threads <
+                   Traits::nr ||  // both other dimensions are tiny or...
+               // k per thread is not small and...
+               (k / num_threads_by_k > 8 * Traits::nr &&
+                // one of the outer dimensions is tiny or sharding by k offers
+                // more parallelism.
+                (numext::mini(m, n) < 2 * Traits::nr ||
+                 num_threads_by_k > num_threads))) {
+      shard_by_k = true;
+    }
+    return shard_by_k;
+  }
+
+  TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
+    // Compute cost.
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
+    // Since the inner gemm kernel is always sharded by column, the lhs
+    // load cost is negligible.
+    lhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
+
+  int numThreadsInnerDim(Index m, Index n, Index k) const {
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
+    double total_parallel_cost =
+        TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
+    // Cost of reduction step accumulating the m*n per-thread buffers into the
+    // result.
+    double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
+        m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
+    int num_threads = 1;
+    double min_cost = total_parallel_cost;
+    double kPerThreadOverHead = 3000;
+    double kFixedOverHead = 100000;
+    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
+      double sequential_cost =
+          kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
+      double parallel_cost = total_parallel_cost / nt + sequential_cost;
+      if (parallel_cost < min_cost) {
+        num_threads = nt;
+        min_cost = parallel_cost;
+      }
+    }
+    return num_threads;
+  }
+
+  double computeBandwidth(bool shard_by_col, Index bm, Index bn,
+                          Index bk) const {
+    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+    // experimentally.
+    double computeBandwidth =
+        bk == 1 ? 4.0
+                : (shard_by_col ? bn : bm) < Traits::nr ||
+                          (shard_by_col ? bm : bn) < Traits::mr
+                      ? 2.0
+                      : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such
+    // instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+    return computeBandwidth;
+  }
+
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index 860a6949a..09d2da9a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -32,6 +32,7 @@ struct traits<TensorConversionOp<TargetType, XprType> >
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
   enum { Flags = 0 };
+  typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
 };
 
 template<typename TargetType, typename XprType>
@@ -50,7 +51,10 @@ struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorCo
 
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
-struct PacketConverter {
+struct PacketConverter;
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl) {}
@@ -108,7 +112,33 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
 };
 
 template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl) {}
+
+  template<int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
+    SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
+    SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
+    SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketConverter(const TensorEvaluator& impl)
       : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
@@ -128,6 +158,7 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
       typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
       internal::scalar_cast_op<SrcType, TgtType> converter;
       EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < TgtPacketSize; ++i) {
         values[i] = converter(m_impl.coeff(index+i));
       }
@@ -163,19 +194,114 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
     typename XprType::Nested m_xpr;
 };
 
-template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
     impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 };
 
-template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
     return impl.evalSubExprsIfNeeded(data);
   }
 };
 
+#ifdef EIGEN_USE_THREADS
+template <bool SameType, typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
+  }
+};
+
+template <typename Eval, typename EvalPointerType,
+          typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
+                                  EvalSubExprsCallback> {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+};
+#endif
+
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(impl.coeff(index));
+  }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.coeff(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = converter(impl.coeff(index+i));
+    }
+    TargetPacket rslt = internal::pload<TargetPacket>(values);
+    return rslt;
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+    PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
+                    SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+    return converter.template packet<LoadMode>(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
+    return internal::pload<TargetPacket>(values);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+    return impl.template packet<LoadMode>(index);
+  }
+};
+
+}  // namespace internal
 
 // Eval as rvalue
 template<typename TargetType, typename ArgType, typename Device>
@@ -189,44 +315,98 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = true,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      =
+    #ifndef EIGEN_USE_SYCL
+                        true,
+    #else
+                        TensorEvaluator<ArgType, Device>::PacketAccess &
+                        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+    #endif
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
+  };
+
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  struct TensorConversionOpBlockFactory {
+    template <typename ArgXprType>
+    struct XprType {
+      typedef TensorConversionOp<TargetType, const ArgXprType> type;
+    };
+
+    template <typename ArgXprType>
+    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+      return typename XprType<ArgXprType>::type(expr);
+    }
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+                                         ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
   {
-    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
+    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
+                               EvaluatorPointerType,
+        EvalSubExprsCallback>::run(m_impl, data, std::move(done));
   }
+#endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  EIGEN_STRONG_INLINE void cleanup()
   {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    internal::scalar_cast_op<SrcType, TargetType> converter;
-    return converter(m_impl.coeff(index));
+    return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
-  {
-    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
-        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
-    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
+  packet(Index index) const {
+    // If we are not going to do the cast, we just need to check that base
+    // TensorEvaluator has packet access. Otherwise we also need to make sure,
+    // that we have an implementation of vectorized cast.
+    const bool Vectorizable =
+        IsSameType
+        ? TensorEvaluator<ArgType, Device>::PacketAccess
+        : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+          int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
+
+    return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
+                                Vectorizable, IsSameType>::run(m_impl, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -244,33 +424,30 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
 
-  protected:
-  template <int LoadMode, bool ActuallyVectorize>
-  struct PacketConv {
-    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-      internal::scalar_cast_op<SrcType, TargetType> converter;
-      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-      for (int i = 0; i < PacketSize; ++i) {
-        values[i] = converter(impl.coeff(index+i));
-      }
-      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-      return rslt;
-    }
-  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_impl.block(desc, scratch),
+                         TensorConversionOpBlockFactory());
+  }
 
-  template <int LoadMode>
-  struct PacketConv<LoadMode, true> {
-    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
-                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
-      return converter.template packet<LoadMode>(index);
-    }
-  };
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
+ protected:
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index abdf742c6..b20f80ba2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -54,8 +54,8 @@ class IndexMapper {
       }
     }
 
-    array<Index, NumDims> cudaInputDimensions;
-    array<Index, NumDims> cudaOutputDimensions;
+    array<Index, NumDims> gpuInputDimensions;
+    array<Index, NumDims> gpuOutputDimensions;
     array<Index, NumDims> tmp = dimensions;
     array<Index, NumDims> ordering;
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@ class IndexMapper {
       const Index index = i + offset;
       ordering[index] = indices[i];
       tmp[indices[i]] = -1;
-      cudaInputDimensions[index] = input_dims[indices[i]];
-      cudaOutputDimensions[index] = dimensions[indices[i]];
+      gpuInputDimensions[index] = input_dims[indices[i]];
+      gpuOutputDimensions[index] = dimensions[indices[i]];
     }
 
     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@ class IndexMapper {
     for (int i = 0; i < NumDims; ++i) {
       if (tmp[i] >= 0) {
         ordering[written] = i;
-        cudaInputDimensions[written] = input_dims[i];
-        cudaOutputDimensions[written] = dimensions[i];
+        gpuInputDimensions[written] = input_dims[i];
+        gpuOutputDimensions[written] = dimensions[i];
         ++written;
       }
     }
@@ -89,37 +89,37 @@ class IndexMapper {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = 0; i < NumDims; ++i) {
         if (i > NumKernelDims) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (i + 1 < offset) {
-          m_cudaInputStrides[i] =
-              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
-          m_cudaOutputStrides[i] =
-              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+        if (static_cast<size_t>(i + 1) < offset) {
+          m_gpuInputStrides[i] =
+              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+          m_gpuOutputStrides[i] =
+              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
         } else {
-          m_cudaInputStrides[i] = 1;
-          m_cudaOutputStrides[i] = 1;
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
         }
       }
     }
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[NumKernelDims];
     } else {
@@ -128,22 +128,22 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaInputStrides[d];
+        const Index idx = p / m_gpuInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_cudaInputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
       }
       inputIndex += p * m_inputStrides[limit];
     }
     return inputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
     Index outputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[NumKernelDims];
     } else {
@@ -152,44 +152,44 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_cudaOutputStrides[d];
+        const Index idx = p / m_gpuOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_cudaOutputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[limit];
     }
     return outputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -197,7 +197,7 @@ class IndexMapper {
            k * m_inputStrides[offset + 2];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -209,8 +209,8 @@ class IndexMapper {
   static const int NumDims = internal::array_size<InputDims>::value;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_outputStrides;
-  array<Index, NumDims> m_cudaInputStrides;
-  array<Index, NumDims> m_cudaOutputStrides;
+  array<Index, NumDims> m_gpuInputStrides;
+  array<Index, NumDims> m_gpuOutputStrides;
 };
 
 
@@ -231,6 +231,8 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<InputXprType>::NumDimensions;
   static const int Layout = traits<InputXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
 
   enum {
     Flags = 0
@@ -300,17 +302,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -374,12 +384,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     preloadKernel();
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
     if (m_local_kernel) {
       m_device.deallocate((void*)m_kernel);
@@ -465,7 +475,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
                                        PacketSize));
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -521,11 +531,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       m_local_kernel = false;
     } else {
       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
       EvalTo evalToTmp(local, m_kernelArg);
-      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
-      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
 
       m_kernel = local;
       m_local_kernel = true;
@@ -544,14 +554,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   KernelArgType m_kernelArg;
   const Scalar* m_kernel;
   bool m_local_kernel;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
 
 
 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
 
 template <int StaticKernelSize>
 struct GetKernelSize {
@@ -568,13 +578,17 @@ struct GetKernelSize<Dynamic> {
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int kernelSize, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -586,18 +600,18 @@ __global__ void EigenConvolutionKernel1D(
 
   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
     // Load inputs to shared memory
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.y * num_x_input;
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
     }
 
     __syncthreads();
 
     // Compute the convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -607,7 +621,7 @@ __global__ void EigenConvolutionKernel1D(
       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
         result += s[k + kernel_offset] * kernel[k];
       }
-      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
       buffer[tensor_index] = result;
     }
     __syncthreads();
@@ -616,14 +630,18 @@ __global__ void EigenConvolutionKernel1D(
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int numY, const int maxY, const int kernelSizeX,
     const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -640,7 +658,7 @@ __global__ void EigenConvolutionKernel2D(
 
   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.z * num_y_input;
 
     // Load inputs to shared memory
@@ -649,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
       const int input_offset = num_x_input * (j + plane_kernel_offset);
       #pragma unroll
       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
         s[i + input_offset] = eval.coeff(tensor_index);
       }
     }
@@ -657,7 +675,7 @@ __global__ void EigenConvolutionKernel2D(
     __syncthreads();
 
     // Convolution
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -673,7 +691,7 @@ __global__ void EigenConvolutionKernel2D(
             result += s[k + input_offset] * kernel[k + kernel_offset];
           }
         }
-        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
         buffer[tensor_index] = result;
       }
     }
@@ -683,7 +701,7 @@ __global__ void EigenConvolutionKernel2D(
 };
 
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
@@ -691,7 +709,11 @@ __global__ void EigenConvolutionKernel3D(
     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
     const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
   extern __shared__ float s[];
+#endif
 
   // Load inputs to shared memory
   const int first_x = blockIdx.x * maxX;
@@ -708,13 +730,13 @@ __global__ void EigenConvolutionKernel3D(
 
   for (int p = 0; p < numPlanes; ++p) {
 
-    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
       }
@@ -726,7 +748,7 @@ __global__ void EigenConvolutionKernel3D(
     const int num_z_output = last_z - first_z + 1;
     const int num_y_output = last_y - first_y + 1;
     const int num_x_output = last_x - first_x + 1;
-    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -739,7 +761,7 @@ __global__ void EigenConvolutionKernel3D(
               }
             }
           }
-          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
           buffer[tensor_index] = result;
         }
       }
@@ -764,13 +786,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   enum {
     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -852,9 +880,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
 
     const int maxSharedMem = m_device.sharedMemPerBlock();
-    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
-    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
     const int warpSize = 32;
 
     switch (NumKernelDims) {
@@ -889,7 +917,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         }
 
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -906,15 +934,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
             m_inputImpl.dimensions(), kernel_dims, indices);
         switch(kernel_size) {
           case 4: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
             break;
           }
           case 7: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
           }
         }
         break;
@@ -946,7 +974,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
 
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
@@ -967,11 +995,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 4: {
             switch (kernel_size_y) {
               case 7: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                 break;
               }
             }
@@ -980,18 +1008,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 7: {
             switch (kernel_size_y) {
               case 4: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                 break;
               }
               default: {
-                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                 break;
               }
             }
             break;
           }
           default: {
-            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
             break;
           }
         }
@@ -1026,7 +1054,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
 
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-        assert(shared_mem <= maxSharedMem);
+        gpu_assert(shared_mem <= maxSharedMem);
 
         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1037,7 +1065,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
             m_inputImpl.dimensions(), kernel_dims, indices);
 
-        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
         break;
       }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 000000000..033318fdc
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV1D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template <typename BooleanDim2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV2D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    // the required row to be calculated for the for each plane in shered memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+      
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); 
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+                                  in_range_dim1 && in_range_dim2)
+                                     ? device_evaluator.coeff(tensor_index)
+                                     : CoeffReturnType(0);
+      }
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // output offset start for each thread
+    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+
+      for (size_t j = 0; j < kernel_size[1]; j++) {
+        size_t kernel_offset = kernel_size[0] * j;
+        const size_t index =
+            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+        for (size_t i = 0; i < kernel_size[0]; i++) {
+          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
+        }
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+                                                             itemID.get_local_id(1) + output_offset[1]);
+
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV3D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<3> kernel_size;
+  const cl::sycl::range<3> input_range;
+  const size_t numP;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+                         const size_t numP_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_),
+        numP(numP_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+  void operator()(cl::sycl::nd_item<3> itemID) {
+    auto buffer_ptr = buffer_acc.get_pointer();
+    auto kernel_ptr = kernel_filter.get_pointer();
+    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+
+    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
+
+    const auto output_offset =
+          cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
+
+    for (size_t p = 0; p < numP; p++) {
+      /// fill the shared memory
+      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+          size_t local_index_dim1 = (num_input[0] * j)  + local_index_dim2;
+          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+            const size_t local_index = local_index_dim1 + i;
+            const size_t tensor_index =
+                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
+          }
+        }
+      }
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+      // calculate the convolution
+
+      if (boundary_check(itemID.get_global_id() < input_range)) {
+        CoeffReturnType result = static_cast<CoeffReturnType>(0);
+        for (size_t k = 0; k < kernel_size[2]; k++) {
+          for (size_t j = 0; j < kernel_size[1]; j++) {
+            for (size_t i = 0; i < kernel_size[0]; i++) {
+              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+              const size_t local_index =
+                  ((i + itemID.get_local_id(0)) +
+                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
+              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+            }
+          }
+        }
+        const size_t tensor_index =
+            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+        buffer_ptr[tensor_index] = result;
+      }
+
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+  }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static const int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
+  static const int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
+  typedef const Eigen::SyclDevice Device;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
+
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+        m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (EvaluatorPointerType)m_device.get(
+          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate_temp(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate_temp(m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    typename KernelStorage::Type in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
+    typedef typename InputEvaluator::Dimensions InputDims;
+    switch (NumKernelDims) {
+      case 1: {
+        const size_t numX = dimensions()[m_indices[0]];
+        const size_t numP = dimensions().TotalSize() / numX;
+        const auto input_dim = std::array<size_t, 2>{numX, numP};
+        auto global_range = cl::sycl::range<2>{};
+        auto local_range = cl::sycl::range<2>{};
+        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 1> indices{{m_indices[0]}};
+        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+            ConvKernel;
+
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+        break;
+      }
+
+      case 2: {
+        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numP = dimensions().TotalSize() / (numX * numY);
+        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+        const size_t local_memory_size =
+            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+        const array<Index, 2> kernel_dims{
+            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+        break;
+      }
+
+      case 3: {
+        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const array<Index, 3> indices{
+            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+                                           m_kernelImpl.dimensions()[kernel_index[1]],
+                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        auto global_range = cl::sycl::range<3>{};
+        auto local_range = cl::sycl::range<3>{};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        auto local_memory_range = (local_range + kernel_size - 1);
+        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+            ConvKernel;
+        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_kernelImpl.bind(cgh);
+    m_inputImpl.bind(cgh);
+    m_buf.bind(cgh);
+    m_kernel.bind(cgh);
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator &operator=(const TensorEvaluator &);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};  // namespace Eigen
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 83c449cf1..195267ce8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -174,8 +174,11 @@ class TensorCostModel {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
     double cost = totalCost(output_size, cost_per_coeff);
-    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    return numext::mini(max_threads, numext::maxi(1, threads));
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads,
+                        numext::maxi<int>(1, static_cast<int>(threads)));
   }
 
   // taskSize assesses parallel task size.
@@ -186,14 +189,13 @@ class TensorCostModel {
     return totalCost(output_size, cost_per_coeff) / kTaskSize;
   }
 
- private:
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
       double output_size, const TensorOpCost& cost_per_coeff) {
     // Cost of memory fetches from L2 cache. 64 is typical cache line size.
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not intersting
+    // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index e020d076f..95a8a84ee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -30,12 +30,13 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
 struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
 {
-  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
@@ -86,18 +87,26 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.expression());
@@ -105,21 +114,21 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<CoeffReturnType*>(
-          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+          m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_result != NULL) {
-      m_device.deallocate(m_result);
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result) {
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -138,19 +147,25 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
-        data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.expression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const ArgType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 
@@ -180,6 +195,8 @@ struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
@@ -242,18 +259,27 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
     m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
@@ -261,20 +287,21 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+        m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
       evalTo(m_result);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
@@ -293,18 +320,25 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_result.bind(cgh);
+  }
+#endif
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
-    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
     m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const XprType m_op;
-  const Device& m_device;
-  CoeffReturnType* m_result;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 29e50a3b2..96fa46c86 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -28,6 +28,8 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
   public:
     TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
 
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
     template<typename OtherDerived>
     EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
       typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
@@ -63,6 +65,73 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
     ExpressionType& m_expression;
 };
 
+/** \class TensorAsyncDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion..
+ *
+ * Example:
+ *    auto done = []() { ... expression evaluation done ... };
+ *    C.device(thread_pool_device, std::move(done)) = A + B;
+ */
+
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice {
+ public:
+  TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
+
+    Assign assign(m_expression, other);
+    Executor::run(assign, m_device);
+    m_done();
+
+    return *this;
+  }
+
+ protected:
+  const DeviceType& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
+                    DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
+
+    // WARNING: After assignment 'm_done' callback will be in undefined state.
+    Assign assign(m_expression, other);
+    Executor::runAsync(assign, m_device, std::move(m_done));
+
+    return *this;
+  }
+
+ protected:
+  const ThreadPoolDevice& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+#endif
+
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 4f5767bc7..f77923933 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -1,337 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
-
-namespace Eigen {
-
-static const int kCudaScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// CUDA streams underneath.
-class StreamInterface {
- public:
-  virtual ~StreamInterface() {}
-
-  virtual const cudaStream_t& stream() const = 0;
-  virtual const cudaDeviceProp& deviceProperties() const = 0;
-
-  // Allocate memory on the actual device where the computation will run
-  virtual void* allocate(size_t num_bytes) const = 0;
-  virtual void deallocate(void* buffer) const = 0;
-
-  // Return a scratchpad buffer of size 1k
-  virtual void* scratchpad() const = 0;
-
-  // Return a semaphore. The semaphore is initially initialized to 0, and
-  // each kernel using it is responsible for resetting to 0 upon completion
-  // to maintain the invariant that the semaphore is always equal to 0 upon
-  // each kernel start.
-  virtual unsigned int* semaphore() const = 0;
-};
-
-static cudaDeviceProp* m_deviceProperties;
-static bool m_devicePropInitialized = false;
-
-static void initializeDeviceProp() {
-  if (!m_devicePropInitialized) {
-    // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
-#if __cplusplus >= 201103L
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-#else
-    static bool first = true;
-    if (first) {
-      first = false;
-#endif
-      // We're the first thread to reach this point.
-      int num_devices;
-      cudaError_t status = cudaGetDeviceCount(&num_devices);
-      if (status != cudaSuccess) {
-        std::cerr << "Failed to get the number of CUDA devices: "
-                  << cudaGetErrorString(status)
-                  << std::endl;
-        assert(status == cudaSuccess);
-      }
-      m_deviceProperties = new cudaDeviceProp[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
-        if (status != cudaSuccess) {
-          std::cerr << "Failed to initialize CUDA device #"
-                    << i
-                    << ": "
-                    << cudaGetErrorString(status)
-                    << std::endl;
-          assert(status == cudaSuccess);
-        }
-      }
-
-#if __cplusplus >= 201103L
-      std::atomic_thread_fence(std::memory_order_release);
-#endif
-      m_devicePropInitialized = true;
-    } else {
-      // Wait for the other thread to inititialize the properties.
-      while (!m_devicePropInitialized) {
-#if __cplusplus >= 201103L
-        std::atomic_thread_fence(std::memory_order_acquire);
-#endif
-        sleep(1);
-      }
-    }
-  }
-}
-
-static const cudaStream_t default_stream = cudaStreamDefault;
-
-class CudaStreamDevice : public StreamInterface {
- public:
-  // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
-    cudaGetDevice(&device_);
-    initializeDeviceProp();
-  }
-  // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
-  // Use the specified stream. Note that it's the
-  // caller responsibility to ensure that the stream can run on
-  // the specified device. If no device is specified the code
-  // assumes that the stream is associated to the current gpu device.
-  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    if (device < 0) {
-      cudaGetDevice(&device_);
-    } else {
-      int num_devices;
-      cudaError_t err = cudaGetDeviceCount(&num_devices);
-      EIGEN_UNUSED_VARIABLE(err)
-      assert(err == cudaSuccess);
-      assert(device < num_devices);
-      device_ = device;
-    }
-    initializeDeviceProp();
-  }
-
-  virtual ~CudaStreamDevice() {
-    if (scratch_) {
-      deallocate(scratch_);
-    }
-  }
-
-  const cudaStream_t& stream() const { return *stream_; }
-  const cudaDeviceProp& deviceProperties() const {
-    return m_deviceProperties[device_];
-  }
-  virtual void* allocate(size_t num_bytes) const {
-    cudaError_t err = cudaSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-    void* result;
-    err = cudaMalloc(&result, num_bytes);
-    assert(err == cudaSuccess);
-    assert(result != NULL);
-    return result;
-  }
-  virtual void deallocate(void* buffer) const {
-    cudaError_t err = cudaSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-    assert(buffer != NULL);
-    err = cudaFree(buffer);
-    assert(err == cudaSuccess);
-  }
-
-  virtual void* scratchpad() const {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
-
-  virtual unsigned int* semaphore() const {
-    if (semaphore_ == NULL) {
-      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
-      EIGEN_UNUSED_VARIABLE(err)
-      assert(err == cudaSuccess);
-    }
-    return semaphore_;
-  }
-
- private:
-  const cudaStream_t* stream_;
-  int device_;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
-  // The StreamInterface is not owned: the caller is
-  // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
-    eigen_assert(stream);
-  }
-  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
-    eigen_assert(stream);
-  }
-  // TODO(bsteiner): This is an internal API, we should not expose it.
-  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
-    return stream_->stream();
-  }
-
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return stream_->allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    stream_->deallocate(buffer);
-  }
-
-  EIGEN_STRONG_INLINE void* scratchpad() const {
-    return stream_->scratchpad();
-  }
-
-  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-    return stream_->semaphore();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef __CUDA_ARCH__
-    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
-                                      stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-#else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    cudaError_t err =
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-  }
-
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    cudaError_t err =
-        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef __CUDA_ARCH__
-    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    assert(err == cudaSuccess);
-#else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // FIXME
-    return 32;
-  }
-
-  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-    // FIXME
-    return 48*1024;
-  }
-
-  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-    // We won't try to take advantage of the l2 cache for the time being, and
-    // there is no l3 cache on cuda devices.
-    return firstLevelCacheSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
-    cudaError_t err = cudaStreamSynchronize(stream_->stream());
-    if (err != cudaSuccess) {
-      std::cerr << "Error detected in CUDA stream: "
-                << cudaGetErrorString(err)
-                << std::endl;
-      assert(err == cudaSuccess);
-    }
-#else
-    assert(false && "The default device should be used instead to generate kernel code");
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
 #endif
-  }
-
-  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
-    return stream_->deviceProperties().multiProcessorCount;
-  }
-  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
-    return stream_->deviceProperties().maxThreadsPerBlock;
-  }
-  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
-    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-  }
-  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
-  }
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    return stream_->deviceProperties().major;
-  }
-  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-    return stream_->deviceProperties().minor;
-  }
-
-  EIGEN_STRONG_INLINE int maxBlocks() const {
-    return max_blocks_;
-  }
-
-  // This function checks if the CUDA runtime recorded an error for the
-  // underlying stream device.
-  inline bool ok() const {
-#ifdef __CUDACC__
-    cudaError_t error = cudaStreamQuery(stream_->stream());
-    return (error == cudaSuccess) || (error == cudaErrorNotReady);
-#else
-    return false;
-#endif
-  }
-
- private:
-  const StreamInterface* stream_;
-  int max_blocks_;
-};
-
-#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
-  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
-  assert(cudaGetLastError() == cudaSuccess);
-
-
-// FIXME: Should be device and kernel specific.
-#ifdef __CUDACC__
-static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
-#ifndef __CUDA_ARCH__
-  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status)
-  assert(status == cudaSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-}  // end namespace Eigen
 
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
+#include "TensorDeviceGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 9d141395b..46b9d3ab2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -21,6 +21,12 @@ struct DefaultDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
   }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
   }
@@ -33,11 +39,18 @@ struct DefaultDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
     // Running on the host CPU
     return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 64;
 #else
     // Running on a CUDA device
     return 32;
@@ -45,9 +58,12 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
     // Running on the host CPU
     return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 48*1024; // FIXME : update this number for HIP
 #else
     // Running on a CUDA device, return the amount of shared memory available.
     return 48*1024;
@@ -55,9 +71,12 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
     // Running single threaded on the host CPU
     return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return firstLevelCacheSize(); // FIXME : update this number for HIP
 #else
     // Running on a CUDA device
     return firstLevelCacheSize();
@@ -65,13 +84,17 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#ifndef __CUDA_ARCH__
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
     // Running single threaded on the host CPU
     // Should return an enum that encodes the ISA supported by the CPU
     return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    // return 1 as major for HIP
+    return 1;
 #else
     // Running on a CUDA device
-    return __CUDA_ARCH__ / 100;
+    return EIGEN_CUDA_ARCH / 100;
 #endif
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 000000000..ec2e3cb14
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// This header file container defines fo gpu* macros which will resolve to
+// their equivalent hip* or cuda* versions depending on the compiler in use
+// A separate header (included at the end of this file) will undefine all 
+#include "TensorGpuHipCudaDefines.h"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const gpuStream_t& stream() const = 0;
+  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : 
+      initialized_(false), first_(true), device_properties_(nullptr) {}
+ 
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+  
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+    return device_properties_[device];
+  }
+
+  EIGEN_STRONG_INLINE bool isInitialized() const {
+    return initialized_;
+  }
+
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately mutex don't
+      // compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
+        if (status != gpuSuccess) {
+          std::cerr << "Failed to get the number of GPU devices: "
+                    << gpuGetErrorString(status)
+                    << std::endl;
+          gpu_assert(status == gpuSuccess);
+        }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #"
+                      << i
+                      << ": "
+                      << gpuGetErrorString(status)
+                      << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to inititialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
+      }
+    }
+  }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    gpuGetDevice(&device_);
+  }
+  // Use the default stream on the specified device
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      gpuGetDevice(&device_);
+    } else {
+      int num_devices;
+      gpuError_t err = gpuGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+      gpu_assert(device < num_devices);
+      device_ = device;
+    }
+  }
+
+  virtual ~GpuStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const gpuStream_t& stream() const { return *stream_; }
+  const gpuDeviceProp_t& deviceProperties() const {
+    return GetGpuDeviceProperties(device_);
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    void* result;
+    err = gpuMalloc(&result, num_bytes);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(buffer != NULL);
+    err = gpuFree(buffer);
+    gpu_assert(err == gpuSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const gpuStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
+    return stream_->stream();
+  }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
+                                      stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    gpuError_t err =
+        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on hip/cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuStreamSynchronize(stream_->stream());
+    if (err != gpuSuccess) {
+      std::cerr << "Error detected in GPU stream: "
+                << gpuGetErrorString(err)
+                << std::endl;
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the GPU runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef EIGEN_GPUCC
+    gpuError_t error = gpuStreamQuery(stream_->stream());
+    return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+  gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+ 
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
+  gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+ 
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  gpu_assert(status == gpuSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+}  // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "TensorGpuHipCudaUndefines.h"
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 7c039890e..df591c21d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,109 +14,1035 @@
 
 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#include <unordered_set>
 
 namespace Eigen {
-struct SyclDevice {
-  /// class members
-  /// sycl queue
-  mutable cl::sycl::queue m_queue;
-  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
-  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
-  mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
-  /// creating device by using selector
-  template<typename dev_Selector> SyclDevice(dev_Selector s)
-  :
-#ifdef EIGEN_EXCEPTIONS
-  m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
-    for (const auto& e : l) {
-      try {
-        std::rethrow_exception(e);
-      } catch (cl::sycl::exception e) {
-          std::cout << e.what() << std::endl;
+
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+  SyclDeviceInfo(cl::sycl::queue queue)
+      : local_mem_type(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_type>()),
+        max_work_item_sizes(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_item_sizes>()),
+        max_mem_alloc_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_mem_alloc_size>()),
+        max_compute_units(queue.get_device()
+                              .template get_info<
+                                  cl::sycl::info::device::max_compute_units>()),
+        max_work_group_size(
+            queue.get_device()
+                .template get_info<
+                    cl::sycl::info::device::max_work_group_size>()),
+        local_mem_size(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::local_mem_size>()),
+        platform_name(queue.get_device()
+                          .get_platform()
+                          .template get_info<cl::sycl::info::platform::name>()),
+        device_name(queue.get_device()
+                        .template get_info<cl::sycl::info::device::name>()),
+        device_vendor(
+            queue.get_device()
+                .template get_info<cl::sycl::info::device::vendor>()) {}
+
+  cl::sycl::info::local_mem_type local_mem_type;
+  cl::sycl::id<3> max_work_item_sizes;
+  unsigned long max_mem_alloc_size;
+  unsigned long max_compute_units;
+  unsigned long max_work_group_size;
+  size_t local_mem_size;
+  std::string platform_name;
+  std::string device_name;
+  std::string device_vendor;
+};
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+
+typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
+    -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+  return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+  std::vector<cl::sycl::device> supported_devices;
+  auto platform_list = cl::sycl::platform::get_platforms();
+  for (const auto &platform : platform_list) {
+    auto device_list = platform.get_devices();
+    auto platform_name =
+        platform.template get_info<cl::sycl::info::platform::name>();
+    std::transform(platform_name.begin(), platform_name.end(),
+                   platform_name.begin(), ::tolower);
+    for (const auto &device : device_list) {
+      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+      bool unsupported_condition =
+          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+           vendor.find("apu") == std::string::npos) ||
+          (platform_name.find("experimental") != std::string::npos) ||
+          device.is_host();
+      if (!unsupported_condition) {
+        supported_devices.push_back(device);
+      }
+    }
+  }
+  return supported_devices;
+#endif
+}
+
+class QueueInterface {
+ public:
+  /// Creating device by using cl::sycl::selector or cl::sycl::device.
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue(dev_or_sel, handler),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+        m_thread_pool(num_threads),
+        m_device_info(m_queue) {
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+    m_prog.build_with_kernel_type<DeviceOrSelector>();
+    auto f = [&](cl::sycl::handler &cgh) {
+      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
+                                        [=]() {})
+    };
+    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
+#endif
+  }
+
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(
+      const DeviceOrSelector &dev_or_sel,
+      unsigned num_threads = std::thread::hardware_concurrency())
+      : QueueInterface(dev_or_sel,
+                       [this](cl::sycl::exception_list l) {
+                         this->exception_caught_ = this->sycl_async_handler(l);
+                       },
+                       num_threads) {}
+
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
+#endif
+
+  /// Attach an existing buffer to the pointer map, Eigen will not reuse it
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return static_cast<void *>(pMapper.add_pointer(buf));
+  }
+
+  /// Detach previously attached buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree<false>(p, pMapper);
+  }
+
+  /// Allocating device pointer. This pointer is actually an 8 bytes host
+  /// pointer used as key to access the sycl device buffer. The reason is that
+  /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
+  /// expressions. So we create a key pointer to be used in Eigen expression
+  /// construction. When we convert the Eigen construction into the sycl
+  /// construction we use this pointer as a key in our buffer_map and we make
+  /// sure that we dedicate only one buffer only for this pointer. The device
+  /// pointer would be deleted by calling deallocate function.
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+    if (align > 0) {
+      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+    }
+#endif
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    if (scratch_buffers.empty()) {
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+      ;
+    } else {
+      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
+        auto buff = pMapper.get_buffer(*it);
+        if (buff.get_size() >= num_bytes) {
+          auto ptr = *it;
+          scratch_buffers.erase(it);
+          return ptr;
+        } else {
+          ++it;
         }
+      }
+      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
     }
-  }))
 #else
-  m_queue(cl::sycl::queue(s))
+    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
 #endif
-  {}
-  // destructor
-  ~SyclDevice() { deallocate_all(); }
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return static_cast<data_t *>(data.get_virtual_pointer());
+  }
 
-  template <typename T> void deallocate(T *p) const {
-    auto it = buffer_map.find(p);
-    if (it != buffer_map.end()) {
-      buffer_map.erase(it);
-      internal::aligned_free(p);
+  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.insert(p);
+#else
+    TensorSycl::internal::SYCLfree(p, pMapper);
+#endif
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
+    deallocate_temp(p.get_virtual_pointer());
+  }
+
+  /// This is used to deallocate the device pointer. p is used as a key inside
+  /// the map to find the device buffer and delete it.
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfree(p, pMapper);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    TensorSycl::internal::SYCLfreeAll(pMapper);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
+  }
+
+  /// The memcpyHostToDevice is used to copy the data from host to device
+  /// The destination pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
+        write_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
     }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
+      auto non_deleter = [](buffer_scalar_t const *) {};
+      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(s_ptr, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
   }
-  void deallocate_all() const {
-    std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
-    while (it!=buffer_map.end()) {
-      auto p=it->first;
-      buffer_map.erase(it);
-      internal::aligned_free(const_cast<void*>(p));
-      it=buffer_map.begin();
+
+  /// The memcpyDeviceToHost is used to copy the data from device to host.
+  /// The source pointer could be deleted before the copy happend which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const void *src, size_t n,
+      std::function<void()> callback) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
+        read_accessor;
+    if (n == 0) {
+      if (callback) callback();
+      return;
     }
-    buffer_map.clear();
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
+      auto non_deleter = [](buffer_scalar_t *) {};
+      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
+      cgh.copy(src_acc, s_ptr);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    synchronize_and_callback(e, callback);
   }
 
-  /// creation of sycl accessor for a buffer. This function first tries to find
-  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
-  ///the function then adds an entry by creating a sycl buffer for that particular pointer.
-  template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
-    return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    static const auto read_mode = cl::sycl::access::mode::read;
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
+      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+      cgh.copy(src_acc, dst_acc);
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
   }
 
-  template<typename T> inline  std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
-    using Type = cl::sycl::buffer<T, 1>;
-    std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
-      [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
-    (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
-    return ret;
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (n == 0) {
+      return;
+    }
+    n /= sizeof(buffer_scalar_t);
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
+      // The cast to uint8_t is here to match the behaviour of the standard
+      // memset. The cast to buffer_scalar_t is needed to match the type of the
+      // accessor (in case buffer_scalar_t is not uint8_t)
+      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
   }
 
-  template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
-    return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
+  /// Get a range accessor to the virtual pointer's device memory. This range
+  /// accessor will allow access to the memory from the pointer to the end of
+  /// the buffer.
+  ///
+  /// NOTE: Inside a kernel the range accessor will always be indexed from the
+  /// start of the buffer, so the offset in the accessor is only used by
+  /// methods like handler::copy and will not be available inside a kernel.
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
+    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
+
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+
+    auto original_buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    eigen_assert(typed_offset >= 0);
+    const auto typed_size = original_buffer.get_size() / sizeof(T);
+    auto buffer = original_buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    const ptrdiff_t size = buffer.get_count() - typed_offset;
+    eigen_assert(size >= 0);
+    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
+                               1, AcMd, global_access, is_place_holder>
+        placeholder_accessor_t;
+    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
+    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
+                                           cl::sycl::id<1>(typed_offset)),
+                    static_cast<size_t>(typed_offset),
+                    reinterpret_cast<std::intptr_t>(start_ptr));
   }
 
-  /// allocating memory on the cpu
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
-    return internal::aligned_malloc(8);
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified size.
+  template <cl::sycl::access::mode AcMd, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                     const Index n_bytes) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(n_bytes >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+    eigen_assert(offset + n_bytes <= buffer.get_size());
+    return buffer.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
   }
 
-  // some runtime conditions that can be applied here
-  bool isDeviceSuitable() const { return true; }
+  /// Creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if
+  /// not, the function then adds an entry by creating a sycl buffer for that
+  /// particular pointer.
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr)
+        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
+            cgh);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_buffer(ptr);
+  }
+
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    return pMapper.get_offset(ptr);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs,
+            typename Rhs, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+                                                  const Rhs &rhs, OutPtr outptr,
+                                                  Range thread_range,
+                                                  Index scratchSize,
+                                                  T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      lhs.bind(cgh);
+      rhs.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr,
+            typename OutPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+                                                 OutPtr &outptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      outptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+    template <typename OutScalar, typename sycl_kernel, typename InPtr,
+           typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+                                                 Range thread_range,
+                                                 Index scratchSize,
+                                                 T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      // binding the placeholder accessors to a commandgroup handler
+      inptr.bind(cgh);
+      typedef cl::sycl::accessor<OutScalar, 1,
+                                 cl::sycl::access::mode::read_write,
+                                 cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+          program().template get_kernel<sycl_kernel>(),
+#endif
+          thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+    async_synchronize(e);
+  }
+
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+
+  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
+    set_latest_event(e);
+#ifndef EIGEN_SYCL_ASYNC_EXECUTION
+    synchronize();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size =
+        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size =
+        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                 static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0)
+        global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(
+        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] =
+        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0)
+        global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size /
+                                        (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0)
+        global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type ==
+           cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
+    return m_device_info.max_mem_alloc_size;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return m_device_info.max_compute_units;
+  }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return m_device_info.max_work_group_size;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return m_device_info.max_work_item_sizes;
+  }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return m_device_info.local_mem_size;
+  }
+
+  // This function returns the nearest power of 2 Work-group size which is <=
+  // maximum device workgroup size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
-    ::memcpy(dst, src, n);
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return m_device_info.platform_name;
   }
 
-  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
-    auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
-    memcpy(host_acc.get_pointer(), src, n);
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return m_device_info.device_name;
   }
- /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
-  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
-    auto it = buffer_map.find(src);
-    if (it != buffer_map.end()) {
-      auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
-      memcpy(dst,host_acc.get_pointer(),  n);
-    } else{
-      eigen_assert("no device memory found. The memory might be destroyed before creation");
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return m_device_info.device_vendor;
+  }
+
+  // This function returns the nearest power of 2
+  // if roundup is true returns result>=wgsize
+  // else it return result <= wgsize
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
     }
+    return !exception_caught_;
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    return latest_events_[std::this_thread::get_id()];
+#else
+    eigen_assert(false);
+    return cl::sycl::event();
+#endif
+  }
+
+  // destructor
+  ~QueueInterface() {
+    pMapper.clear();
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+    scratch_buffers.clear();
+#endif
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
-    ::memset(buffer, c, n);
+ protected:
+  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+    std::lock_guard<std::mutex> lock(event_mutex_);
+    latest_events_[std::this_thread::get_id()] = e;
+#else
+    EIGEN_UNUSED_VARIABLE(e);
+#endif
+  }
+
+  void synchronize_and_callback(cl::sycl::event e,
+                                const std::function<void()> &callback) const {
+    set_latest_event(e);
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-  return 1;
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
   }
+
+  /// class members:
+  bool exception_caught_ = false;
+
+  mutable std::mutex pmapper_mutex_;
+
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+  mutable std::mutex event_mutex_;
+  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
+#endif
+
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of
+  /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
+  /// host we should manually deallocate it.
+  mutable TensorSycl::internal::PointerMapper pMapper;
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+  mutable std::unordered_set<void *> scratch_buffers;
+#endif
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  mutable cl::sycl::program m_prog;
+#endif
+
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
 };
 
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. it is the caller's responsibility to destroy
+  /// it
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream)
+      : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
+    return m_queue_stream;
+  }
+};
+
+// Here is a sycl device struct which accept the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream)
+      : SyclDeviceBase(queue_stream) {}
+
+  // this is the accessor used to construct the evaluator
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+  get_range_accessor(const void *ptr) const {
+    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+  }
+
+  // get sycl accessor
+  template <cl::sycl::access::mode AcMd>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
+  }
+
+  /// Accessing the created sycl device buffer for the device pointer
+  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+      const void *ptr) const {
+    return queue_stream()->get_sycl_buffer(ptr);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+                                              Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+      cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(
+      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+      cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+    return queue_stream()->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+    return queue_stream()->allocate_temp(num_bytes);
+  }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const {
+    queue_stream()->deallocate(p);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  template <cl::sycl::access::mode AcMd, typename T>
+  EIGEN_STRONG_INLINE void deallocate_temp(
+      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+    queue_stream()->deallocate_temp(buffer);
+  }
+  EIGEN_STRONG_INLINE void deallocate_all() const {
+    queue_stream()->deallocate_all();
+  }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+      cl::sycl::access::mode::read_write, data_t>
+  get(data_t *data) const {
+    return queue_stream()->get(data);
+  }
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+                                        data_t>
+          data) const {
+    return queue_stream()->get(data);
+  }
+
+  /// attach existing buffer
+  EIGEN_STRONG_INLINE void *attach_buffer(
+      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+    return queue_stream()->attach_buffer(buf);
+  }
+  /// detach buffer
+  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+    queue_stream()->detach_buffer(p);
+  }
+  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+    return queue_stream()->get_offset(ptr);
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(
+      Index *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+      void *dst, const Index *src, size_t n,
+      std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    queue_stream()->memset(data, c, n);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
+    return queue_stream()->sycl_queue();
+  }
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
+    return queue_stream()->program();
+  }
+#endif
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+    return queue_stream()->maxSyclThreadsPerBlock();
+  }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+    return queue_stream()->maxWorkItemSizes();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL doesnot have such concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+    return queue_stream()->sharedMemPerBlock();
+  }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return queue_stream()->majorDeviceVersion();
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+    queue_stream()->synchronize();
+  }
+  EIGEN_STRONG_INLINE void async_synchronize(
+      cl::sycl::event e = cl::sycl::event()) const {
+    queue_stream()->async_synchronize(e);
+  }
+  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+    return queue_stream()->get_latest_event();
+  }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+    return queue_stream()->has_local_memory();
+  }
+  EIGEN_STRONG_INLINE long max_buffer_size() const {
+    return queue_stream()->max_buffer_size();
+  }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const {
+    return queue_stream()->getPlatformName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const {
+    return queue_stream()->getDeviceName();
+  }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+    return queue_stream()->getDeviceVendor();
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+        var...);
+  }
+};
 }  // end namespace Eigen
 
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 069680a11..e524b535a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,67 +12,6 @@
 
 namespace Eigen {
 
-// Use the SimpleThreadPool by default. We'll switch to the new non blocking
-// thread pool later.
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-
-
-// Barrier is an object that allows one or more threads to wait until
-// Notify has been called a specified number of times.
-class Barrier {
- public:
-  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
-    eigen_assert(((count << 1) >> 1) == count);
-  }
-  ~Barrier() {
-    eigen_assert((state_>>1) == 0);
-  }
-
-  void Notify() {
-    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
-    if (v != 1) {
-      eigen_assert(((v + 2) & ~1) != 0);
-      return;  // either count has not dropped to 0, or waiter is not waiting
-    }
-    std::unique_lock<std::mutex> l(mu_);
-    eigen_assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
-
-  void Wait() {
-    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
-    if ((v >> 1) == 0) return;
-    std::unique_lock<std::mutex> l(mu_);
-    while (!notified_) {
-      cv_.wait(l);
-    }
-  }
-
- private:
-  std::mutex mu_;
-  std::condition_variable cv_;
-  std::atomic<unsigned int> state_;  // low bit is waiter flag
-  bool notified_;
-};
-
-
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1) {};
-};
-
-
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
 template <typename Function, typename... Args> struct FunctionWrapperWithNotification
@@ -102,22 +41,75 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   }
 }
 
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
 
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return internal::aligned_malloc(num_bytes);
+    return allocator_ ? allocator_->allocate(num_bytes)
+        : internal::aligned_malloc(num_bytes);
   }
 
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    internal::aligned_free(buffer);
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+    EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+    return allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+    deallocate(buffer);
+  }
+
+  template<typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
   }
 
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
     ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+                   numext::mini(blocksize, n - (i * blocksize)));
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
@@ -134,6 +126,12 @@ struct ThreadPoolDevice {
     return num_threads_;
   }
 
+  // Number of theads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const {
+    return pool_->NumThreads();
+  }
+
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     return l1CacheSize();
   }
@@ -149,23 +147,31 @@ struct ThreadPoolDevice {
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
+                                            Args&&... args) const {
     Notification* n = new Notification();
-    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
+                  std::move(f), args...));
     return n;
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
-                                                Function&& f,
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                 Args&&... args) const {
-    pool_->Schedule(std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
+    pool_->Schedule(
+        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
+                  std::move(f), args...));
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    pool_->Schedule(std::bind(f, args...));
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
+                                                 Args&&... args) const {
+    if (sizeof...(args) > 0) {
+      pool_->Schedule(std::bind(std::move(f), args...));
+    } else {
+      pool_->Schedule(std::move(f));
+    }
   }
 
   // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
@@ -174,45 +180,193 @@ struct ThreadPoolDevice {
     return pool_->CurrentThreadId();
   }
 
-  // parallelFor executes f with [0, n) arguments in parallel and waits for
-  // completion. F accepts a half-open interval [first, last).
-  // Block size is choosen based on the iteration cost and resulting parallel
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
   // efficiency. If block_align is not nullptr, it is called to round up the
   // block size.
   void parallelFor(Index n, const TensorOpCost& cost,
                    std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
-    typedef TensorCostModel<ThreadPoolDevice> CostModel;
+    if (EIGEN_PREDICT_FALSE(n <= 0)){
+      return;
+    // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 ||
+               CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
+                                                  Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+      // Single block or less, execute directly.
+      f(firstIdx, lastIdx);
+      barrier.Notify();
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finished, it will call
+  // 'done' callback. F accepts a half-open interval [first, last). Block size
+  // is chosen based on the iteration cost and resulting parallel efficiency. If
+  // block_align is not nullptr, it is called to round up the block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
     if (n <= 1 || numThreads() == 1 ||
         CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
+      done();
       return;
     }
 
-    // Calculate block size based on (1) the iteration cost and (2) parallel
-    // efficiency. We want blocks to be not too small to mitigate
-    // parallelization overheads; not too large to mitigate tail
-    // effect and potential load imbalance and we also want number
-    // of blocks to be evenly dividable across threads.
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx =
+        new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule(
+            [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost,
+                        std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
+
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  // For parallelForAsync we must keep passed in closures on the heap, and
+  // delete them only after `done` callback finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count,
+                            std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count),
+          f(std::move(block_f)),
+          done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance and we also want number of blocks to be evenly dividable across
+  // threads.
+  ParallelForBlock CalculateParallelForBlock(
+      const Index n, const TensorOpCost& cost,
+      std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(
+               divup<Index>(n, max_oversharding_factor * numThreads()),
+               block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
 
-    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
-    Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f));
-    const Index max_block_size =
-        numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f));
     if (block_align) {
       Index new_block_size = block_align(block_size);
       eigen_assert(new_block_size >= block_size);
       block_size = numext::mini(n, new_block_size);
     }
+
     Index block_count = divup(n, block_size);
+
     // Calculate parallel efficiency as fraction of total CPU time used for
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
         (divup<int>(block_count, numThreads()) * numThreads());
+
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
-    for (Index prev_block_count = block_count; prev_block_count > 1;) {
+    for (Index prev_block_count = block_count;
+         max_efficiency < 1.0 && prev_block_count > 1;) {
       // This is the next block size that divides size into a smaller number
       // of blocks than the current block_size.
       Index coarser_block_size = divup(n, prev_block_count - 1);
@@ -241,36 +395,12 @@ struct ThreadPoolDevice {
       }
     }
 
-    // Recursively divide size into halves until we reach block_size.
-    // Division code rounds mid to block_size, so we are guaranteed to get
-    // block_count leaves that do actual computations.
-    Barrier barrier(static_cast<unsigned int>(block_count));
-    std::function<void(Index, Index)> handleRange;
-    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
-      if (last - first <= block_size) {
-        // Single block or less, execute directly.
-        f(first, last);
-        barrier.Notify();
-        return;
-      }
-      // Split into halves and submit to the pool.
-      Index mid = first + divup((last - first) / 2, block_size) * block_size;
-      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
-    };
-    handleRange(0, n);
-    barrier.Wait();
-  }
-
-  // Convenience wrapper for parallelFor that does not align blocks.
-  void parallelFor(Index n, const TensorOpCost& cost,
-                   std::function<void(Index, Index)> f) const {
-    parallelFor(n, cost, nullptr, std::move(f));
+    return {block_size, block_count};
   }
 
- private:
   ThreadPoolInterface* pool_;
   int num_threads_;
+  Allocator* allocator_;
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index b24cdebf1..f0f1e832a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -32,16 +32,16 @@ namespace Eigen {
 // Boilerplate code
 namespace internal {
 
-template<std::size_t n, typename Dimension> struct dget {
-  static const std::size_t value = get<n, Dimension>::value;
+template<std::ptrdiff_t n, typename Dimension> struct dget {
+  static const std::ptrdiff_t value = get<n, Dimension>::value;
 };
 
 
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const& indices,
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
                           const Dimensions& dimensions)
   {
     return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
@@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
+  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
   {
     return 0;
   }
 };
 
-template<typename Index, std::size_t n>
+template<typename Index, std::ptrdiff_t n>
 struct fixed_size_tensor_index_extraction_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index index,
+  static EIGEN_STRONG_INLINE Index run(const Index index,
                           const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
@@ -77,7 +77,7 @@ template<typename Index>
 struct fixed_size_tensor_index_extraction_helper<Index, 0>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static inline Index run(const Index,
+  static EIGEN_STRONG_INLINE Index run(const Index,
                           const Dimensions&)
   {
     return 0;
@@ -90,9 +90,11 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
 // Fixed size
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices>
-struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
+struct Sizes {
   typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  const Base t = Base();
   static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+  static const ptrdiff_t count = Base::count;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
     return Base::count;
@@ -119,17 +121,17 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
     return *this;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
-    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
   }
 };
 
@@ -142,25 +144,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indi
 
 #else
 
-template <std::size_t n>
+template <std::ptrdiff_t n>
 struct non_zero_size {
-  typedef internal::type2val<std::size_t, n> type;
+  typedef internal::type2val<std::ptrdiff_t, n> type;
 };
 template <>
 struct non_zero_size<0> {
   typedef internal::null_type type;
 };
 
-template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
+template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
   typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
-  static const size_t count = Base::count;
-  static const std::size_t total_size = internal::arg_prod<Base>::value;
+  static const std::ptrdiff_t count = Base::count;
+  static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
     return count;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
     return internal::arg_prod<Base>::value;
   }
 
@@ -176,7 +178,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
-  explicit Sizes(std::initializer_list<std::size_t>) {
+  explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
     // todo: add assertion
   }
 #else
@@ -192,7 +194,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
   }
 #endif
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex operator[] (const int index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
     switch (index) {
       case 0:
         return internal::get<0, Base>::value;
@@ -206,23 +208,23 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0
         return internal::get<4, Base>::value;
       default:
         eigen_assert(false && "index overflow");
-        return static_cast<DenseIndex>(-1);
+        return static_cast<Index>(-1);
     }
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
   }
 };
 
 namespace internal {
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
   return Sizes<V1, V2, V3, V4, V5>::total_size;
 }
 }
@@ -231,7 +233,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2,
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -243,7 +245,7 @@ struct tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -262,7 +264,7 @@ struct DSizes : array<DenseIndex, NumDims> {
   typedef array<DenseIndex, NumDims> Base;
   static const int count = NumDims;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
     return NumDims;
   }
 
@@ -282,6 +284,57 @@ struct DSizes : array<DenseIndex, NumDims> {
     (*this)[0] = i0;
   }
 
+  EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+
+  // Enable DSizes index type promotion only if we are promoting to the
+  // larger type, e.g. allow to promote dimensions of type int to long.
+  template<typename OtherIndex>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const array<OtherIndex, NumDims>& other,
+                  // Default template parameters require c++11.
+                  typename internal::enable_if<
+                     internal::is_same<
+                         DenseIndex,
+                         typename internal::promote_index_type<
+                             DenseIndex,
+                             OtherIndex
+                         >::type
+                     >::value, void*>::type = 0) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = static_cast<DenseIndex>(other[i]);
+    }
+  }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC
+  explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    for (int i = 0; i < dimensions.count; ++i) {
+      (*this)[i] = dimensions[i];
+    }
+  }
+#endif
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+  template <typename std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#else
+  template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+    for (int i = 0 ; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+#endif
+
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
@@ -330,12 +383,21 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
 };
 
-
-
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os,
+                         const DSizes<IndexType, NumDims>& dims) {
+  os << "[";
+  for (int i = 0; i < NumDims; ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
 struct tensor_vsize_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -347,7 +409,7 @@ struct tensor_vsize_index_linearization_helper
   }
 };
 
-template<typename Index, std::size_t NumIndices, bool RowMajor>
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
 struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -362,10 +424,10 @@ struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 namespace internal {
 
 template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
+  static const ptrdiff_t value = NumDims;
 };
 template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
-  static const size_t value = NumDims;
+  static const ptrdiff_t value = NumDims;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
@@ -375,42 +437,42 @@ template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...
 static const std::ptrdiff_t value = Sizes<Indices...>::count;
 };
 template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
-  return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
+  return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
 }
 template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
   eigen_assert(false && "should never be called");
   return -1;
 }
 #else
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
-  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
+template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
   return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
 }
 
 #endif
 
 
-template <typename Dims1, typename Dims2, size_t n, size_t m>
+template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
 struct sizes_match_below_dim {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return false;
   }
 };
-template <typename Dims1, typename Dims2, size_t n>
+template <typename Dims1, typename Dims2, ptrdiff_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1& dims1, Dims2& dims2) {
-    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+    return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
         sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
 template <typename Dims1, typename Dims2>
 struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
-  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
     return true;
   }
 };
@@ -419,7 +481,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
 
 
 template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
   return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 06987132b..a48d035f5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -32,6 +32,7 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename MakePointer_<Scalar>::Type PointerType;
 
   enum {
     Flags = 0
@@ -41,6 +42,8 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
     // Intermediate typedef to workaround MSVC issue.
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
+
+
   };
 };
 
@@ -73,6 +76,8 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
+  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
@@ -98,38 +103,60 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef typename XprType::Index Index;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = true
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = true,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device),
-          m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
-  { }
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
-    return m_op;
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorBlockAssignment<
+      CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
+      TensorBlockAssignment;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
+
+
+  EIGEN_STRONG_INLINE ~TensorEvaluator() {
   }
 
-  typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
+
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
     EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
 
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+  }
+#endif
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
@@ -137,7 +164,34 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    // Add `m_buffer` as destination buffer to the block descriptor.
+    desc.template AddDestinationBuffer<Layout>(
+        /*dst_base=*/m_buffer + desc.offset(),
+        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
+
+    ArgTensorBlock block =
+        m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
+
+    // If block was evaluated into a destination buffer, there is no need to do
+    // an assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
+              m_buffer, desc.offset()),
+          block.expr());
+    }
+    block.cleanup();
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -159,19 +213,20 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
         TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
   ArgType expression() const { return m_expression; }
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_buffer.bind(cgh);
+  }
+  #endif
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
 
  private:
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
-  DevicePointer m_buffer;
-  const XprType& m_op;
+  EvaluatorPointerType m_buffer;
   const ArgType m_expression;
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 834ce07df..3aff7fa01 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,44 +32,72 @@ struct TensorEvaluator
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef Derived XprType;
+  static const int PacketSize =  PacketType<CoeffReturnType, Device>::size;
+  typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords > 0,
-    RawAccess = true
+    IsAligned          = Derived::IsAligned,
+    PacketAccess       = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess        = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+    PreferBlockAccess  = false,
+    Layout             = Derived::Layout,
+    CoordAccess        = NumCoords > 0,
+    RawAccess          = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
+        m_dims(m.dimensions()),
+        m_device(device)
   { }
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  const Derived& derived() const { return m_impl; }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
-    if (dest) {
-      m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+    eigen_assert(m_data != NULL);
     return m_data[index];
   }
 
@@ -79,6 +107,18 @@ struct TensorEvaluator
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
@@ -86,7 +126,7 @@ struct TensorEvaluator
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -94,8 +134,9 @@ struct TensorEvaluator
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
-    eigen_assert(m_data);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
+  coeffRef(const array<DenseIndex, NumCoords>& coords) {
+    eigen_assert(m_data != NULL);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -105,19 +146,50 @@ struct TensorEvaluator
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
   }
 
-  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
+
+  template<typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(m_data != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
+                                            Index>
+        TensorBlockAssign;
 
-  /// required by sycl in order to construct sycl buffer from raw pointer
-  const Device& device() const{return m_device;}
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(m_dims), m_data,
+                                  desc.offset()),
+        block.expr());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  protected:
-  typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
+  EvaluatorPointerType m_data;
   Dimensions m_dims;
-  const Device& m_device;
-  const Derived& m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 namespace {
@@ -126,7 +198,7 @@ T loadConstant(const T* address) {
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float loadConstant(const float* address) {
   return __ldg(address);
@@ -140,6 +212,13 @@ Eigen::half loadConstant(const Eigen::half* address) {
   return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
+#ifdef EIGEN_USE_SYCL
+// overload of load constant should be implemented here based on range access
+template <cl::sycl::access::mode AcMd, typename T>
+T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
+  return *address;
+}
+#endif
 }
 
 
@@ -152,40 +231,64 @@ struct TensorEvaluator<const Derived, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef const Derived XprType;
+  typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+  typedef StorageMemory<const Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = Derived::IsAligned,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    Layout = Derived::Layout,
-    CoordAccess = NumCoords > 0,
-    RawAccess = true
+    IsAligned         = Derived::IsAligned,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    Layout            = Derived::Layout,
+    CoordAccess       = NumCoords > 0,
+    RawAccess         = true
   };
 
-  // Used for accessor extraction in SYCL Managed TensorMap:
-  const Derived& derived() const { return m_impl; }
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
+  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
-      m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
+      m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     return loadConstant(m_data+index);
   }
 
@@ -195,8 +298,20 @@ struct TensorEvaluator<const Derived, Device>
     return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
   }
 
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+  {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data);
+    eigen_assert(m_data != NULL);
     const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
                         : m_dims.IndexOfRowMajor(coords);
     return loadConstant(m_data+index);
@@ -204,19 +319,32 @@ struct TensorEvaluator<const Derived, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
 
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
 
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
  protected:
-  typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
+  EvaluatorPointerType m_data;
   Dimensions m_dims;
-  const Device& m_device;
-  const Derived& m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
@@ -229,15 +357,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
-  enum {
-    IsAligned = true,
-    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
       : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
@@ -246,13 +365,42 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+    #ifdef EIGEN_USE_SYCL
+    &&  (PacketType<CoeffReturnType, Device>::size >1)
+    #endif
+    ,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    done(true);
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
@@ -268,16 +416,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        internal::unpacket_traits<PacketReturnType>::size);
+                        PacketType<CoeffReturnType, Device>::size);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
-  /// required by sycl in order to extract the accessor
-  NullaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC  EvaluatorPointerType data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_argImpl.bind(cgh);
+  }
+#endif
 
  private:
   const NullaryOp m_functor;
@@ -295,32 +444,60 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned          = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess       = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+                         int(internal::functor_traits<UnaryOp>::PacketAccess),
+    BlockAccess        = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess  = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout             = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess        = false,  // to be implemented
+    RawAccess          = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_functor(op.functor()),
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
       m_argImpl(op.nestedExpression(), device)
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_argImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_argImpl.cleanup();
   }
 
@@ -341,15 +518,31 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
         TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.getResourceRequirements().addCostPerCoeff(
+        {0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
+  }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
-  /// added for sycl in order to construct the buffer from sycl device
-  UnaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
+    m_argImpl.bind(cgh);
+  }
+#endif
 
 
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const UnaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
 };
@@ -363,16 +556,23 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
 
   enum {
-    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
-                   internal::functor_traits<BinaryOp>::PacketAccess,
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+                        int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess      = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
+                        int(internal::functor_traits<BinaryOp>::PacketAccess),
+    BlockAccess       = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                        int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_functor(op.functor()),
+  TensorEvaluator(const XprType& op, const Device& device)
+    : m_device(device),
+      m_functor(op.functor()),
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
@@ -384,8 +584,27 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int NumDims = internal::array_size<
+      typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
+      LeftTensorBlock;
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+      RightTensorBlock;
+
+  typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
+                                           RightTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -393,12 +612,25 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
     return m_leftImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): Evaluate two expression in parallel?
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
+                                            [done](bool) { done(true); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
   }
@@ -421,15 +653,34 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
-  /// required by sycl in order to extract the accessor
-  BinaryOp functor() const { return m_functor; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return internal::TensorBlockResourceRequirements::merge(
+               m_leftImpl.getResourceRequirements(),
+               m_rightImpl.getResourceRequirements())
+        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    desc.DropDestinationBuffer();
+    return TensorBlock(m_leftImpl.block(desc, scratch),
+                         m_rightImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_leftImpl.bind(cgh);
+    m_rightImpl.bind(cgh);
+  }
+  #endif
  private:
+  const Device EIGEN_DEVICE_REF m_device;
   const BinaryOp m_functor;
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -444,14 +695,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
 
   enum {
     IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
-                   internal::functor_traits<TernaryOp>::PacketAccess,
-    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    PacketAccess      = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg2Type, Device>::PacketAccess &&
+                        TensorEvaluator<Arg3Type, Device>::PacketAccess &&
+                        internal::functor_traits<TernaryOp>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+  TensorEvaluator(const XprType& op, const Device& device)
     : m_functor(op.functor()),
       m_arg1Impl(op.arg1Expression(), device),
       m_arg2Impl(op.arg2Expression(), device),
@@ -479,8 +736,14 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -488,13 +751,13 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
     return m_arg1Impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_arg1Impl.evalSubExprsIfNeeded(NULL);
     m_arg2Impl.evalSubExprsIfNeeded(NULL);
     m_arg3Impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_arg1Impl.cleanup();
     m_arg2Impl.cleanup();
     m_arg3Impl.cleanup();
@@ -521,14 +784,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_arg1Impl.bind(cgh);
+    m_arg2Impl.bind(cgh);
+    m_arg3Impl.bind(cgh);
+  }
+#endif
 
  private:
   const TernaryOp m_functor;
@@ -547,15 +812,23 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                   internal::packet_traits<Scalar>::HasBlend,
-    Layout = TensorEvaluator<IfArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = TensorEvaluator<ThenArgType, Device>::IsAligned &
+                        TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+                        TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                        PacketType<Scalar, Device>::HasBlend,
+    BlockAccess       = TensorEvaluator<IfArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ThenArgType, Device>::BlockAccess &&
+                        TensorEvaluator<ElseArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+    Layout            = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+  TensorEvaluator(const XprType& op, const Device& device)
     : m_condImpl(op.ifExpression(), device),
       m_thenImpl(op.thenExpression(), device),
       m_elseImpl(op.elseExpression(), device)
@@ -569,8 +842,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Index Index;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static const int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+    typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
+      IfArgTensorBlock;
+  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
+      ThenArgTensorBlock;
+  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
+      ElseArgTensorBlock;
+
+  struct TensorSelectOpBlockFactory {
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    struct XprType {
+      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+    };
+
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+        const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+    }
+  };
+
+  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+                                           IfArgTensorBlock, ThenArgTensorBlock,
+                                           ElseArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -578,13 +885,26 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
     return m_condImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_condImpl.evalSubExprsIfNeeded(NULL);
     m_thenImpl.evalSubExprsIfNeeded(NULL);
     m_elseImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+      m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+        m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); });
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_condImpl.cleanup();
     m_thenImpl.cleanup();
     m_elseImpl.cleanup();
@@ -597,13 +917,15 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
-    internal::Selector<PacketSize> select;
-    for (Index i = 0; i < PacketSize; ++i) {
-      select.select[i] = m_condImpl.coeff(index+i);
-    }
-    return internal::pblend(select,
-                            m_thenImpl.template packet<LoadMode>(index),
-                            m_elseImpl.template packet<LoadMode>(index));
+     internal::Selector<PacketSize> select;
+     EIGEN_UNROLL_LOOP
+     for (Index i = 0; i < PacketSize; ++i) {
+       select.select[i] = m_condImpl.coeff(index+i);
+     }
+     return internal::pblend(select,
+                             m_thenImpl.template packet<LoadMode>(index),
+                             m_elseImpl.template packet<LoadMode>(index));
+
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -613,14 +935,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    auto then_req = m_thenImpl.getResourceRequirements();
+    auto else_req = m_elseImpl.getResourceRequirements();
+
+    auto merged_req =
+        internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+    merged_req.cost_per_coeff =
+        then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
+    return internal::TensorBlockResourceRequirements::merge(
+        m_condImpl.getResourceRequirements(), merged_req);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // It's unsafe to pass destination buffer to underlying expressions, because
+    // output might be aliased with one of the inputs.
+    desc.DropDestinationBuffer();
+
+    return TensorBlock(
+        m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
+        m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
+  }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_condImpl.bind(cgh);
+    m_thenImpl.bind(cgh);
+    m_elseImpl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
   TensorEvaluator<ThenArgType, Device> m_thenImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f01d77c0a..c52fb77dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,31 +12,94 @@
 
 namespace Eigen {
 
-/** \class TensorExecutor
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief The tensor executor class.
-  *
-  * This class is responsible for launch the evaluation of the expression on
-  * the specified computing device.
-  */
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
 namespace internal {
 
-// Default strategy: the expression is evaluated with a single cpu thread.
-template<typename Expression, typename Device, bool Vectorizable>
-class TensorExecutor
-{
+/**
+ * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
+ * expensive. If expression has at least one broadcast op in it, and it supports
+ * block based evaluation, we always prefer it, even for the small tensors. For
+ * all other tileable ops, block evaluation overhead for small tensors (fits
+ * into L1) is too large, and we fallback on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template<typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorAssignOp<LhsXprType, RhsXprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template<typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
+        ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+  };
+};
+
+template<typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+    const TensorBroadcastingOp<Broadcast, XprType> > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling>
+class TensorExecutor {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same<Device, DefaultDevice>::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const Device& device = Device())
-  {
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Device& device = Device()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      for (Index i = 0; i < size; ++i) {
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -44,35 +107,48 @@ class TensorExecutor
   }
 };
 
-
-template<typename Expression>
-class TensorExecutor<Expression, DefaultDevice, true>
-{
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+                     /*Tiling=*/TiledEvaluation::Off> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
+
   EIGEN_DEVICE_FUNC
-  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
-  {
+  static EIGEN_STRONG_INLINE void run(
+      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<
+          Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      const StorageIndex UnrolledSize =
+          (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const Index VectorizedSize = (size / PacketSize) * PacketSize;
-      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (Index i = VectorizedSize; i < size; ++i) {
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -80,55 +156,162 @@ class TensorExecutor<Expression, DefaultDevice, true>
   }
 };
 
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+  typedef typename traits<Expression>::Index StorageIndex;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const DefaultDevice& device = DefaultDevice()) {
+    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
+        TensorBlockMapper;
+
+    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
+        TensorBlockDesc;
+    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+        TensorBlockScratch;
+
+    Evaluator evaluator(expr, device);
+
+    // TODO(ezhulenev): Do not use tiling for small tensors?
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
 
+    if (needs_assign) {
+      // Query expression tree for desired block size/shape.
+      const TensorBlockResourceRequirements requirements =
+          evaluator.getResourceRequirements();
 
-// Multicore strategy: the index space is partitioned and each partition is executed on a single core
+      const TensorBlockMapper block_mapper(
+          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
+          requirements);
+
+      // Share scratch memory allocator between all blocks.
+      TensorBlockScratch scratch(device);
+
+      const StorageIndex total_block_count = block_mapper.blockCount();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
+        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
+        evaluator.evalBlock(desc, scratch);
+        scratch.reset();
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ *
+ * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
+ *     pool, and will block the caller thread until all tasks are finished.
+ *
+ * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
+ *     the ThreadPoolDevice managed thread pool, and will return immediately.
+ *     It will call 'done' callback after all tasks are finished.
+ */
 #ifdef EIGEN_USE_THREADS
-template <typename Evaluator, typename Index, bool Vectorizable>
+
+template <typename TensorBlockMapper>
+struct TensorExecutorTilingContext {
+  TensorExecutorTilingContext() = default;
+  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
+                              const TensorOpCost& b_cost, size_t b_aligned_size)
+      : block_mapper(b_mapper),
+        cost(b_cost),
+        aligned_blocksize(b_aligned_size) {}
+
+  TensorBlockMapper block_mapper;  // navigate through blocks
+  TensorOpCost cost;               // cost of computing a single block
+  size_t aligned_blocksize;        // block size after memory alignment
+};
+
+// Computes a block evaluation parameters, and allocates temporary memory buffer
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
+template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
+TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
+    const Evaluator& evaluator) {
+  // Query expression tree for desired block size/shape.
+  TensorBlockResourceRequirements requirements =
+      evaluator.getResourceRequirements();
+
+  // Update target block size based on cost model.
+  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+      1, requirements.cost_per_coeff);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
+
+  TensorBlockMapper block_mapper(
+      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
+      requirements);
+
+  size_t block_size = block_mapper.blockTotalSize();
+  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+  const size_t aligned_blocksize =
+      align *
+      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+
+  return {block_mapper, requirements.cost_per_coeff * block_size,
+          aligned_blocksize};
+}
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    for (Index i = first; i < last; ++i) {
+    eigen_assert(lastIdx >= firstIdx);
+    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
-    return size;
-  }
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
 };
 
-template <typename Evaluator, typename Index>
-struct EvalRange<Evaluator, Index, true> {
-  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static const int PacketSize =
+      unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    Index i = first;
-    if (last - first >= PacketSize) {
-      eigen_assert(first % PacketSize == 0);
-      Index last_chunk_offset = last - 4 * PacketSize;
-      // Give the compiler a strong hint to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive the compiler should not
+    eigen_assert(lastIdx >= firstIdx);
+    StorageIndex i = firstIdx;
+    if (lastIdx - firstIdx >= PacketSize) {
+      eigen_assert(firstIdx % PacketSize == 0);
+      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4*PacketSize) {
-        for (Index j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      last_chunk_offset = last - PacketSize;
+      last_chunk_offset = lastIdx - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-    for (; i < last; ++i) {
+    for (; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static Index alignBlockSize(Index size) {
+  static StorageIndex alignBlockSize(StorageIndex size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -138,144 +321,376 @@ struct EvalRange<Evaluator, Index, true> {
   }
 };
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
  public:
-  typedef typename Expression::Index Index;
-  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
-  {
+  typedef typename Expression::Index StorageIndex;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                         const ThreadPoolDevice& device) {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
     Evaluator evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign)
-    {
-      const Index size = array_prod(evaluator.dimensions());
-#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
-                         [&evaluator](Index first, Index last) {
-                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         EvalRange::alignBlockSize,
+                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
+                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                          });
-#else
-      size_t num_threads = device.numThreads();
-      if (num_threads > 1) {
-        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
-            size, evaluator.costPerCoeff(Vectorizable), num_threads);
-      }
-      if (num_threads == 1) {
-        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
-      } else {
-        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
-        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-        const Index numblocks = size / blocksize;
-
-        Barrier barrier(numblocks);
-        for (int i = 0; i < numblocks; ++i) {
-          device.enqueue_with_barrier(
-              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
-              &evaluator, i * blocksize, (i + 1) * blocksize);
-        }
-        if (numblocks * blocksize < size) {
-          EvalRange<Evaluator, Index, Vectorizable>::run(
-              &evaluator, numblocks * blocksize, size);
+    }
+    evaluator.cleanup();
+  }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+      TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const ThreadPoolDevice& device) {
+    Evaluator evaluator(expr, device);
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const TilingContext tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+                                                   Vectorizable>(evaluator);
+
+      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+                                                       IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
+          evaluator.evalBlock(desc, scratch);
+          scratch.reset();
         }
-        barrier.Wait();
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(device);
+        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
+        evaluator.evalBlock(desc, scratch);
+      } else {
+        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
+                           eval_block);
       }
-#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
     }
     evaluator.cleanup();
   }
 };
-#endif  // EIGEN_USE_THREADS
 
+template <typename Expression, typename DoneCallback, bool Vectorizable,
+          TiledEvaluation Tiling>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
+      device.parallelForAsync(
+          size, ctx->evaluator.costPerCoeff(Vectorizable),
+          EvalRange::alignBlockSize,
+          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
+            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
+          },
+          [ctx]() { delete ctx; });
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    Evaluator evaluator;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+  static const int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+      TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+                                           const ThreadPoolDevice& device,
+                                           DoneCallback done) {
+
+    TensorAsyncExecutorContext* const ctx =
+        new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      ctx->tiling = internal::GetTensorExecutorTilingContext<
+          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
+
+      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(ctx->device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+             ++block_idx) {
+          TensorBlockDesc desc =
+              ctx->tiling.block_mapper.blockDescriptor(block_idx);
+          ctx->evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (ctx->tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(ctx->device);
+        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+        ctx->evaluator.evalBlock(desc, scratch);
+        delete ctx;
+      } else {
+        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
+                                     ctx->tiling.cost, eval_block,
+                                     [ctx]() { delete ctx; });
+      }
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr,
+                               const ThreadPoolDevice& thread_pool,
+                               DoneCallback done)
+        : device(thread_pool),
+          evaluator(expr, thread_pool),
+          on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    const ThreadPoolDevice& device;
+    Evaluator evaluator;
+    TilingContext tiling;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+#endif  // EIGEN_USE_THREADS
 
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU)
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
  public:
-  typedef typename Expression::Index Index;
+  typedef typename Expression::Index StorageIndex;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
-
-#if defined(__CUDACC__)
-template <typename Evaluator, typename Index, bool Vectorizable>
+#if defined(EIGEN_GPUCC)
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    for (Index i = first; i < last; i += step_size) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
-struct EigenMetaKernelEval<Evaluator, Index, true> {
-  static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, Index first, Index last, Index step_size) {
-    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const Index vectorized_size = (last / PacketSize) * PacketSize;
-    const Index vectorized_step_size = step_size * PacketSize;
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (Index i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (Index i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename Index>
+template <typename Evaluator, typename StorageIndex>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, Index size) {
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
 
-  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index step_size = blockDim.x * gridDim.x;
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
-template <typename Expression, bool Vectorizable>
-inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
     const Expression& expr, const GpuDevice& device) {
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
   if (needs_assign) {
-    const int block_size = device.maxCudaThreadsPerBlock();
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const Index size = array_prod(evaluator.dimensions());
+
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
-    LAUNCH_CUDA_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+    LAUNCH_GPU_KERNEL(
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
         num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
 }
 
-#endif  // __CUDACC__
+#endif  // EIGEN_GPUCC
 #endif  // EIGEN_USE_GPU
 
 // SYCL Executor policy
 #ifdef EIGEN_USE_SYCL
 
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, SyclDevice, Vectorizable> {
-public:
-  static inline void run(const Expression &expr, const SyclDevice &device) {
-    // call TensorSYCL module
-    TensorSycl::run(expr, device);
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
+  typedef typename Evaluator::Index Index;
+  Evaluator evaluator;
+  const Index range;
+  template <typename Scratch>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+      const Scratch, Evaluator evaluator_, const Index range_)
+      : evaluator(evaluator_), range(range_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+      cl::sycl::nd_item<1> itemID) {
+    compute(itemID);
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    Index total_threads = itemID.get_global_range(0);
+
+    for (Index i = gId; i < range; i += total_threads) {
+      evaluator.evalScalar(i);
+    }
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+  compute(const cl::sycl::nd_item<1>& itemID) {
+    const Index vectorizedRange =
+        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    const Index start = Evaluator::PacketSize * gId;
+    for (Index i = start; i < vectorizedRange; i += step) {
+      evaluator.evalPacket(i);
+    }
+    gId += vectorizedRange;
+    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index Index;
+  static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                      const Eigen::SyclDevice& dev) {
+    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+    Evaluator evaluator(expr, dev);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      Index range, GRange, tileSize;
+      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+      total_size = (total_size == 0) ? 1 : total_size;
+      const int PacketSize =
+          Eigen::PacketType<typename Evaluator::CoeffReturnType,
+                            Eigen::SyclDevice>::size;
+      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
+      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+      range = total_size;
+
+      dev.template nullary_kernel_launcher<
+          typename Evaluator::CoeffReturnType,
+          ExecExprFunctorKernel<Evaluator> >(
+          evaluator,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+                                cl::sycl::range<1>(tileSize)),
+          Index(1), range);
+    }
+    evaluator.cleanup();
   }
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index 85dfc7a69..c9bccfc66 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -38,7 +38,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename XprTraits::PointerType PointerType;
   enum {
     Flags = 0
   };
@@ -89,6 +89,10 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar, 
+                                  typename XprTraits::PointerType
+                                  >::type 
+                                  PointerType;
 };
 
 template<typename UnaryOp, typename XprType>
@@ -161,7 +165,12 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                                      typename traits<LhsXprType>::PointerType,
+                                                      typename traits<RhsXprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
     Flags = 0
   };
@@ -238,7 +247,12 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
   typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-
+  typedef typename TypeConversion<Scalar,
+                                  typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+                                                      typename traits<Arg2XprType>::PointerType,
+                                                      typename traits<Arg3XprType>::PointerType>::type
+                                  >::type 
+                                  PointerType;
   enum {
     Flags = 0
   };
@@ -314,6 +328,9 @@ struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
   typedef typename ElseXprType::Nested ElseNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+                               typename traits<ThenXprType>::PointerType,
+                               typename traits<ElseXprType>::PointerType>::type PointerType;
 };
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index 08eb5595a..4a1a0687c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 
-// This code requires the ability to initialize arrays of constant
-// values directly inside a class.
-#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
-
 namespace Eigen {
 
 /** \class TensorFFT
@@ -71,6 +67,7 @@ struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
@@ -130,17 +127,24 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
   typedef OutputScalar CoeffReturnType;
   typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = true,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
@@ -165,19 +169,19 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return m_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalToBuf(data);
       return false;
     } else {
-      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
+      m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
       evalToBuf(m_data);
       return true;
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     if (m_data) {
       m_device.deallocate(m_data);
       m_data = NULL;
@@ -200,11 +204,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
-
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_data.bind(cgh);
+  }
+#endif
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
     const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
     ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
 
@@ -230,20 +239,32 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
         // for n = 0, 1,..., line_len-1.
         // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-        pos_j_base_powered[0] = ComplexScalar(1, 0);
-        if (line_len > 1) {
-          const RealScalar pi_over_len(EIGEN_PI / line_len);
-          const ComplexScalar pos_j_base = ComplexScalar(
-	       std::cos(pi_over_len), std::sin(pi_over_len));
-          pos_j_base_powered[1] = pos_j_base;
-          if (line_len > 2) {
-            const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-            for (int j = 2; j < line_len + 1; ++j) {
-              pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
-                                      pos_j_base_powered[j - 1] /
-                                      pos_j_base_powered[j - 2] * pos_j_base_sq;
-            }
-          }
+
+        // The recurrence is correct in exact arithmetic, but causes
+        // numerical issues for large transforms, especially in
+        // single-precision floating point.
+        //
+        // pos_j_base_powered[0] = ComplexScalar(1, 0);
+        // if (line_len > 1) {
+        //   const ComplexScalar pos_j_base = ComplexScalar(
+        //       numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
+        //   pos_j_base_powered[1] = pos_j_base;
+        //   if (line_len > 2) {
+        //     const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+        //     for (int i = 2; i < line_len + 1; ++i) {
+        //       pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
+        //           pos_j_base_powered[i - 1] /
+        //           pos_j_base_powered[i - 2] *
+        //           pos_j_base_sq;
+        //     }
+        //   }
+        // }
+        // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
+        // and cosine functions here.
+        for (int j = 0; j < line_len + 1; ++j) {
+          double arg = ((EIGEN_PI * j) * j) / line_len;
+          std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
+          pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
         }
       }
 
@@ -253,7 +274,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -261,7 +282,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
           }
         }
 
-        // processs the line
+        // process the line
         if (is_power_of_two) {
           processDataLineCooleyTukey(line_buf, line_len, log_len);
         }
@@ -271,7 +292,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor =  ComplexScalar(1.0 / line_len, 0);
@@ -562,12 +583,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
  protected:
   Index m_size;
-  const FFT& m_fft;
+  const FFT EIGEN_DEVICE_REF m_fft;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
   TensorEvaluator<ArgType, Device> m_impl;
-  CoeffReturnType* m_data;
-  const Device& m_device;
+  EvaluatorPointerType m_data;
+  const Device EIGEN_DEVICE_REF m_device;
 
   // This will support a maximum FFT size of 2^32 for each dimension
   // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
@@ -645,7 +666,4 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_HAS_CONSTEXPR
-
-
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index fcee5f60d..ca39bb855 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -20,7 +20,7 @@ namespace Eigen {
   * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
-  * Eigen::TensorFixedSize<float, Size<3,5,7>> t;
+  * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
   */
 
 template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
@@ -40,11 +40,18 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
 
     enum {
       IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+      BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
     };
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   typedef Dimensions_ Dimensions;
   static const std::size_t NumIndices = Dimensions::count;
 
@@ -333,27 +340,10 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
-    {
-      // FIXME: check that the dimensions of other match the dimensions of *this.
-      // Unfortunately this isn't possible yet when the rhs is an expression.
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    // FIXME: check that the dimensions of other match the dimensions of *this.
+    // Unfortunately this isn't possible yet when the rhs is an expression.
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
 
   protected:
     EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index bbd5eb374..e800dedc6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -19,15 +19,9 @@ namespace Eigen {
   *
   *
   */
-/// template <class> class MakePointer_ is added to convert the host pointer to the device pointer.
-/// It is added due to the fact that for our device compiler T* is not allowed.
-/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T.
-/// This is done through our MakePointer_ class. By default the Type in the MakePointer_<T> is T* .
-/// Therefore, by adding the default value, we managed to convert the type and it does not break any
-/// existing code as its default value is T*.
 namespace internal {
-template<typename XprType, template <class> class MakePointer_>
-struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename XprType::Scalar Scalar;
@@ -38,35 +32,31 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 
   enum {
     Flags = 0
   };
-  template <class T> struct MakePointer {
-    // Intermediate typedef to workaround MSVC issue.
-    typedef MakePointer_<T> MakePointerT;
-    typedef typename MakePointerT::Type Type;
-  };
 };
 
-template<typename XprType, template <class> class MakePointer_>
-struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
+  typedef const TensorForcedEvalOp<XprType>& type;
 };
 
-template<typename XprType, template <class> class MakePointer_>
-struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
 {
-  typedef TensorForcedEvalOp<XprType, MakePointer_> type;
+  typedef TensorForcedEvalOp<XprType> type;
 };
 
 }  // end namespace internal
 
 
 
-template<typename XprType, template <class> class MakePointer_>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -87,49 +77,113 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
     typename XprType::Nested m_xpr;
 };
 
+namespace internal {
+template <typename Device, typename CoeffReturnType>
+struct non_integral_type_placement_new{
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
+   // Initialize non-trivially constructible types.
+    if (!internal::is_arithmetic<CoeffReturnType>::value) {
+      for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
+    }
+}
+};
+
+// SYCL does not support non-integral types 
+// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices 
+// no matching function for call to 'operator new'
+template <typename CoeffReturnType>
+struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
+  template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
+}
+};
+} // end namespace internal
 
-template<typename ArgType, typename Device, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
+template<typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
 {
-  typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
+  typedef const typename internal::remove_all<ArgType_>::type ArgType;
+  typedef TensorForcedEvalOp<ArgType> XprType;
   typedef typename ArgType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = (PacketSize > 1),
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess = true
+    IsAligned         = true,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = internal::is_arithmetic<CoeffReturnType>::value,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = true
   };
 
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-	/// op_ is used for sycl
-      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
+  static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_op(op.expression()),
+      m_device(device), m_buffer(NULL)
   { }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     const Index numValues =  internal::array_prod(m_impl.dimensions());
-    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
-    // Should initialize the memory in case we're dealing with non POD types.
-    if (NumTraits<CoeffReturnType>::RequireInitialization) {
-      for (Index i = 0; i < numValues; ++i) {
-        new(m_buffer+i) CoeffReturnType();
-      }
-    }
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+
+   internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
+
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
-    EvalTo evalToTmp(m_buffer, m_op);
-    const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
-    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, PacketAccess>::run(evalToTmp, m_device);
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    internal::TensorExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        run(evalToTmp, m_device);
+
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate(m_buffer);
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
+        numValues * sizeof(CoeffReturnType)));
+    typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
+        EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
+                             std::move(done));
+    internal::TensorAsyncExecutor<
+        const EvalTo, typename internal::remove_const<Device>::type,
+        decltype(on_done),
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+        runAsync(evalToTmp, m_device, std::move(on_done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_device.deallocate_temp(m_buffer);
     m_buffer = NULL;
   }
 
@@ -144,21 +198,37 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    assert(m_buffer != NULL);
+    return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EvaluatorPointerType data() const { return m_buffer; }
 
-  /// required by sycl in order to extract the sycl accessor
-  const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
-  /// used by sycl in order to build the sycl buffer
-  const Device& device() const{return m_device;}
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_buffer.bind(cgh);
+    m_impl.bind(cgh);
+  }
+#endif
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
-  const Device& m_device;
-  typename MakePointer<CoeffReturnType>::Type m_buffer;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_buffer;
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 52b803d7f..246ebe44e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-// MakePointer class is used as a container of the adress space of the pointer
+// MakePointer class is used as a container of the address space of the pointer
 // on the host and on the device. From the host side it generates the T* pointer
 // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
 // T* m_data on the host. It is always called on the device.
@@ -20,8 +20,35 @@ namespace Eigen {
 // map_allocator.
 template<typename T> struct MakePointer {
   typedef T* Type;
+  typedef const T* ConstType;
 };
 
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
+  return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device specific pointer
+// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
+// is a device-agnostic type and need MakePointer class for type conversion,
+// the TensorEvaluator class can be specialized for a device, hence it is possible
+// to construct different types of temproray storage memory in TensorEvaluator
+// for different devices by specializing the following StorageMemory class.
+template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
+
+namespace internal{
+template<typename A, typename B> struct Pointer_type_promotion {
+  static const bool val=false;
+};
+template<typename A> struct Pointer_type_promotion<A, A> {
+  static const bool val = true;
+};
+template<typename A, typename B> struct TypeConversion {
+  typedef A* type;
+};
+}
+
+
 template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
@@ -37,7 +64,7 @@ template<typename Op, typename Dims, typename XprType, template <class> class Ma
 template<typename XprType> class TensorIndexTupleOp;
 template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
 template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
 template<typename TargetType, typename XprType> class TensorConversionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
@@ -58,21 +85,50 @@ template<typename Strides, typename XprType> class TensorInflationOp;
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
 template<typename Op, typename XprType> class TensorScanOp;
+template<typename Dims, typename XprType> class TensorTraceOp;
 
 template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
 
 template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
+template<typename XprType> class TensorForcedEvalOp;
 
 template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;
 
+struct NoOpOutputKernel;
+
 struct DefaultDevice;
 struct ThreadPoolDevice;
 struct GpuDevice;
 struct SyclDevice;
 
+#ifdef EIGEN_USE_SYCL
+
+template <typename T> struct MakeSYCLPointer {
+  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
+};
+
+template <typename T>
+EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
+constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
+  return data;
+}
+
+template <typename T>
+struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
+template <typename T>
+struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
+
+namespace TensorSycl {
+namespace internal{
+template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
+}
+}
+#endif
+
+
 enum FFTResultType {
   RealPart = 0,
   ImagPart = 1,
@@ -98,10 +154,36 @@ struct IsVectorizable<GpuDevice, Expression> {
                             TensorEvaluator<Expression, GpuDevice>::IsAligned;
 };
 
+// Tiled evaluation strategy.
+enum TiledEvaluation {
+  Off = 0,    // tiled evaluation is not supported
+  On = 1,     // still work in progress (see TensorBlock.h)
+};
+
+template <typename Device, typename Expression>
+struct IsTileable {
+  // Check that block evaluation is supported and it's a preferred option (at
+  // least one sub-expression has much faster block evaluation, e.g.
+  // broadcasting).
+  static const bool BlockAccess =
+      TensorEvaluator<Expression, Device>::BlockAccess &&
+      TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+  static const TiledEvaluation value =
+      BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
+};
+
 template <typename Expression, typename Device,
-          bool Vectorizable = IsVectorizable<Device, Expression>::value>
+          bool Vectorizable      = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
 class TensorExecutor;
 
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorAsyncExecutor;
+
+
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index d73f6dc68..d96303224 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -20,7 +20,7 @@ namespace internal {
 template <typename Scalar>
 struct scalar_mod_op {
   EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
   const Scalar m_divisor;
 };
 template <typename Scalar>
@@ -33,8 +33,8 @@ struct functor_traits<scalar_mod_op<Scalar> >
  */
 template <typename Scalar>
 struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
-  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
 
 template <typename Scalar>
 struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
   operator()(const Scalar& a, const Scalar& b) const {
     return numext::fmod(a, b);
@@ -54,50 +54,19 @@ struct functor_traits<scalar_fmod_op<Scalar> > {
          PacketAccess = false };
 };
 
-
-/** \internal
-  * \brief Template functor to compute the sigmoid of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
-  */
-template <typename T>
-struct scalar_sigmoid_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
-    const T one = T(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(T(1));
-    return pdiv(one, padd(one, pexp(pnegate(x))));
-  }
-};
-
-template <typename T>
-struct functor_traits<scalar_sigmoid_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
-                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
-  };
-};
-
-
 template<typename Reducer, typename Device>
 struct reducer_traits {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_sum_op<T> sum_op;
     *accum = sum_op(*accum, t);
@@ -133,16 +102,14 @@ template <typename T, typename Device>
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd,
+    IsStateful = false,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
-
 template <typename T> struct MeanReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
-  static const bool IsStateful = true;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
@@ -166,16 +133,20 @@ template <typename T> struct MeanReducer
     return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
-    return accum / scalarCount_;
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(accum, T(scalarCount_));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
-    return pdiv(vaccum, pset1<Packet>(packetCount_));
+    return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
     internal::scalar_sum_op<T> sum_op;
-    return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(
+        sum_op(saccum, predux(vaccum)),
+        T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
   }
 
   protected:
@@ -187,7 +158,10 @@ template <typename T, typename Device>
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd
+    PacketAccess = PacketType<T, Device>::HasAdd &&
+                   PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
+    IsStateful = true,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
   };
 };
 
@@ -218,20 +192,19 @@ struct MinMaxBottomValue<T, false, false> {
 };
 
 
-template <typename T> struct MaxReducer
+template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMax;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t > *accum) { *accum = t; }
+    scalar_max_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmax<Packet>(*accum, p);
+    scalar_max_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -246,33 +219,34 @@ template <typename T> struct MaxReducer
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::maxi(saccum, predux_max(vaccum));
+    scalar_max_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MaxReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMax
+    PacketAccess = PacketType<T, Device>::HasMax,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
-template <typename T> struct MinReducer
+template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMin;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t < *accum) { *accum = t; }
+    scalar_min_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
-    (*accum) = pmin<Packet>(*accum, p);
+    scalar_min_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+    return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -287,24 +261,23 @@ template <typename T> struct MinReducer
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return numext::mini(saccum, predux_min(vaccum));
+    scalar_min_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
   }
 };
 
-template <typename T, typename Device>
-struct reducer_traits<MinReducer<T>, Device> {
+template <typename T, typename Device, int NaNPropagation>
+    struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMin
+    PacketAccess = PacketType<T, Device>::HasMin,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation!=PropagateFast)
   };
 };
 
-
 template <typename T> struct ProdReducer
 {
-  static const bool PacketAccess = packet_traits<T>::HasMul;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_product_op<T> prod_op;
     (*accum) = prod_op(*accum, t);
@@ -313,7 +286,6 @@ template <typename T> struct ProdReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmul<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     internal::scalar_cast_op<int, T> conv;
     return conv(1);
@@ -340,16 +312,15 @@ template <typename T, typename Device>
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = PacketType<T, Device>::HasMul
+    PacketAccess = PacketType<T, Device>::HasMul,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 struct AndReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum && t;
   }
@@ -365,15 +336,14 @@ template <typename Device>
 struct reducer_traits<AndReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 struct OrReducer {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum || t;
   }
@@ -389,19 +359,22 @@ template <typename Device>
 struct reducer_traits<OrReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
-
-// Argmin/Argmax reducers
+// Argmin/Argmax reducers.  Returns the first occurrence if multiple locations
+// contain the same min/max value.
 template <typename T> struct ArgMaxTupleReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
-    if (t.second > accum->second) { *accum = t; }
+    if (t.second < accum->second) {
+      return;
+    } else if (t.second > accum->second || accum->first > t.first ) {
+      *accum = t;
+    }
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     return T(0, NumTraits<typename T::second_type>::lowest());
@@ -415,18 +388,21 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
 
 template <typename T> struct ArgMinTupleReducer
 {
-  static const bool PacketAccess = false;
-  static const bool IsStateful = false;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
-    if (t.second < accum->second) { *accum = t; }
+    if (t.second > accum->second) {
+      return;
+    } else if (t.second < accum->second || accum->first > t.first) {
+      *accum = t;
+    }
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
     return T(0, NumTraits<typename T::second_type>::highest());
@@ -440,7 +416,9 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMinTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false
+    PacketAccess = false,
+    IsStateful = false,
+    IsExactlyAssociative = true
   };
 };
 
@@ -454,6 +432,7 @@ class GaussianGenerator {
                                       const array<T, NumDims>& std_devs)
       : m_means(means)
   {
+    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
     }
@@ -461,6 +440,7 @@ class GaussianGenerator {
 
   EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
+    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
       tmp += offset * offset / m_two_sigmas[i];
@@ -483,6 +463,25 @@ struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
   };
 };
 
+template <typename Scalar>
+struct scalar_clamp_op {
+  EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    return numext::mini(numext::maxi(x, m_min), m_max);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& x) const {
+    return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
+  }
+  const Scalar m_min;
+  const Scalar m_max;
+};
+template<typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
+
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index eb1d4934e..174bf0683 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-/** \class TensorGenerator
+/** \class TensorGeneratorOp
   * \ingroup CXX11_Tensor_Module
   *
   * \brief Tensor generator class.
@@ -31,6 +31,7 @@ struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Generator, typename XprType>
@@ -87,40 +88,58 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
-    BlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = true,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_generator(op.generator())
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      :  m_device(device), m_generator(op.generator())
   {
-    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
-    m_dimensions = impl.dimensions();
+    TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+    m_dimensions = argImpl.dimensions();
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_strides[0] = 1;
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims - 1] = 1;
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
@@ -133,7 +152,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -145,6 +164,97 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.firstLevelCacheSize();
+    // TODO(ezhulenev): Generator should have a cost.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+        target_size);
+  }
+
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    static const bool is_col_major =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    // Compute spatial coordinates for the first block element.
+    array<Index, NumDims> coords;
+    extract_coordinates(desc.offset(), coords);
+    array<Index, NumDims> initial_coords = coords;
+
+    // Offset in the output block buffer.
+    Index offset = 0;
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = is_col_major ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+      it[i].span = it[i].stride * (it[i].size - 1);
+      it[i].count = 0;
+    }
+    eigen_assert(it[0].stride == 1);
+
+    // Prepare storage for the materialized generator result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+    const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      Index i = 0;
+      // Generate data for the vectorized part of the inner-most dimension.
+      for (; i <= inner_dim_vectorized; i += packet_size) {
+        for (Index j = 0; j < packet_size; ++j) {
+          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
+          j_coords[inner_dim] += j;
+          *(block_buffer + offset + i + j) = m_generator(j_coords);
+        }
+        coords[inner_dim] += packet_size;
+      }
+      // Finalize non-vectorized part of the inner-most dimension.
+      for (; i < inner_dim_size; ++i) {
+        *(block_buffer + offset + i) = m_generator(coords);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if (NumDims == 1) break;
+
+      // Update offset.
+      for (i = 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          offset += it[i].stride;
+          coords[is_col_major ? i : NumDims - 1 - i]++;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        coords[is_col_major ? i : NumDims - 1 - i] =
+            initial_coords[is_col_major ? i : NumDims - 1 - i];
+        offset -= it[i].span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool) const {
     // TODO(rmlarsen): This is just a placeholder. Define interface to make
@@ -153,21 +263,26 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
                                   TensorOpCost::MulCost<Scalar>());
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType  data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_strides[i];
+        const Index idx = index / m_fast_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
       coords[0] = index;
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_strides[i];
+        const Index idx = index / m_fast_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
@@ -175,8 +290,10 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     }
   }
 
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fast_strides;
   Generator m_generator;
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
new file mode 100644
index 000000000..cb53ce298
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by 
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
+#endif
+
+#endif // gpu_assert
+
+#endif  // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
new file mode 100644
index 000000000..1d142f2ee
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t 
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 566856ed2..dd51850b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -27,6 +27,7 @@ namespace Eigen {
   * patch_cols, and 1 for all the additional dimensions.
   */
 namespace internal {
+
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -38,6 +39,7 @@ struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -52,6 +54,66 @@ struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorIm
   typedef TensorImagePatchOp<Rows, Cols, XprType> type;
 };
 
+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    for (Index i = 0; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+      Scalar* dst_data, const Index src_index) {
+    const Impl& impl = self.impl();
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Index vectorized_size =
+        (num_coeff_to_copy / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      Packet p = impl.template packet<Unaligned>(src_index + i);
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+      const Index num_coeff_to_pad, const Scalar padding_value,
+      const Index dst_index, Scalar* dst_data) {
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Packet padded_packet = internal::pset1<Packet>(padding_value);
+    const Index vectorized_size =
+        (num_coeff_to_pad / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
+                                                   padded_packet);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+      dst_data[dst_index + i] = padding_value;
+    }
+  }
+};
+
 }  // end namespace internal
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -70,12 +132,12 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                            DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                            PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                           m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex row_strides, DenseIndex col_strides,
@@ -84,13 +146,14 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_rows() const { return m_patch_rows; }
@@ -161,18 +224,26 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   typedef TensorEvaluator<ArgType, Device> Impl;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = false,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+      : m_device(device), m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -238,9 +309,15 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
           // Calculate the padding
           m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
           m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+          // The padding size calculation for PADDING_SAME has been updated to
+          // be consistent with how TensorFlow extracts its paddings.
+          m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
+          m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
           break;
         default:
           eigen_assert(false && "unexpected padding");
+          m_outputCols=0; // silence the uninitialised warning;
+          m_outputRows=0; //// silence the uninitialised warning;
       }
     }
     eigen_assert(m_outputRows > 0);
@@ -312,12 +389,20 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -418,20 +503,27 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     return packetWithPossibleZero(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -449,6 +541,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -500,6 +593,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   Scalar m_paddingValue;
 
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 3209fecd3..2d8c7b903 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -37,36 +37,36 @@ namespace Eigen {
   * \sa Tensor
   */
 
-template <DenseIndex n>
+template <Index n>
 struct type2index {
-  static const DenseIndex value = n;
-  EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
-  EIGEN_DEVICE_FUNC void set(DenseIndex val) {
+  static const Index value = n;
+  EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
+  EIGEN_DEVICE_FUNC void set(Index val) {
     eigen_assert(val == n);
   }
 };
 
 // This can be used with IndexPairList to get compile-time constant pairs,
 // such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct type2indexpair {
-  static const DenseIndex first = f;
-  static const DenseIndex second = s;
+  static const Index first = f;
+  static const Index second = s;
 
-  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
-    return IndexPair<DenseIndex>(f, s);
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
+    return IndexPair<Index>(f, s);
   }
 
-  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
+  EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
     eigen_assert(val.first == f);
     eigen_assert(val.second == s);
   }
 };
 
 
-template<DenseIndex n> struct NumTraits<type2index<n> >
+template<Index n> struct NumTraits<type2index<n> >
 {
-  typedef DenseIndex Real;
+  typedef Index Real;
   enum {
     IsComplex = 0,
     RequireInitialization = false,
@@ -75,28 +75,28 @@ template<DenseIndex n> struct NumTraits<type2index<n> >
     MulCost = 1
   };
 
-  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
-  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
-  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
 };
 
 namespace internal {
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
-  val = new_val;
+EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
+  val = internal::convert_index<T>(new_val);
 }
-template <DenseIndex n>
-EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
+template <Index n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
   val.set(new_val);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
   val = new_val;
 }
-template <DenseIndex f, DenseIndex s>
-EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
+template <Index f, Index s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
   val.set(new_val);
 }
 
@@ -106,36 +106,36 @@ struct is_compile_time_constant {
   static constexpr bool value = false;
 };
 
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx> > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<type2index<idx>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex idx>
+template <Index idx>
 struct is_compile_time_constant<const type2index<idx>& > {
   static constexpr bool value = true;
 };
 
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
-template <DenseIndex f, DenseIndex s>
+template <Index f, Index s>
 struct is_compile_time_constant<const type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
@@ -228,15 +228,15 @@ template <typename T, typename... O>
 
 
 
-template <DenseIndex Idx, typename ValueT>
+template <Index Idx, typename ValueT>
 struct tuple_coeff {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
     //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
     return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
     if (i == Idx) {
       update_value(array_get<Idx>(t), value);
     } else {
@@ -245,7 +245,7 @@ struct tuple_coeff {
   }
 
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
     return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
         tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
   }
@@ -268,18 +268,18 @@ struct tuple_coeff {
 template <typename ValueT>
 struct tuple_coeff<0, ValueT> {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
     //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
     return array_get<0>(t)/* * (i == 0)*/;
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
     eigen_assert (i == 0);
     update_value(array_get<0>(t), value);
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>&) {
-    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value & (i == 0);
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
   }
 
   template <typename... T>
@@ -298,32 +298,43 @@ struct tuple_coeff<0, ValueT> {
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
   EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
   EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
   }
 
   EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
   }
 };
 
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+                         const IndexList<FirstType, OtherTypes...>& dims) {
+  os << "[";
+  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
 
 template<typename FirstType, typename... OtherTypes>
 constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
@@ -333,26 +344,28 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
   }
 };
 
 namespace internal {
 
-template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
-  size_t result = 1;
-  for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+template<typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  Index result = 1;
+  EIGEN_UNROLL_LOOP
+  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
     result *= sizes[i];
   }
   return result;
@@ -372,30 +385,30 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
   static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
 };
 
-template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
-template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
 
 template <typename T>
 struct index_known_statically_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
@@ -447,14 +460,14 @@ template <typename FirstType, typename... OtherTypes>
 
 template <typename Tx>
 struct index_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -462,7 +475,7 @@ struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -471,14 +484,14 @@ struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_ne_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -486,7 +499,7 @@ struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -495,14 +508,14 @@ struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_gt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -510,7 +523,7 @@ struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -520,14 +533,14 @@ struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_lt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -535,7 +548,7 @@ struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -545,14 +558,14 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -560,7 +573,7 @@ struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes..
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -570,14 +583,14 @@ struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherT
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -585,7 +598,7 @@ struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes.
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -602,7 +615,7 @@ namespace internal {
 
 template <typename T>
 struct index_known_statically_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
     return false;
   }
 };
@@ -623,42 +636,42 @@ struct indices_statically_known_to_increase_impl {
 
 template <typename T>
 struct index_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_ne_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_gt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_lt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
     return false;
   }
 };
@@ -674,7 +687,7 @@ struct index_pair_second_statically_eq_impl {
 namespace Eigen {
 namespace internal {
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
   return index_known_statically_impl<T>::run(i);
 }
 
@@ -689,32 +702,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
   return index_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
   return index_statically_ne_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
   return index_statically_gt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
   return index_statically_lt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
   return index_pair_first_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
   return index_pair_second_statically_eq_impl<T>::run(i, value);
 }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index f391fb9ee..c5cb61af5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -31,6 +31,7 @@ struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
@@ -84,18 +85,25 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_strides(op.strides())
   {
     m_dimensions = m_impl.dimensions();
@@ -129,11 +137,11 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -144,6 +152,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index < dimensions().TotalSize());
     *inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -158,6 +167,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
       *inputIndex += index / m_strides[0];
       return true;
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -193,6 +203,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -213,7 +224,14 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
                         compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 33edc49e3..26a3818f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -32,7 +32,7 @@ struct Initializer {
                   Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
                   const InitList& vals) {
     int i = 0;
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - N] = i++;
       Initializer<Derived, N - 1>::run(tensor, indices, v);
     }
@@ -48,7 +48,7 @@ struct Initializer<Derived, 1> {
                   const InitList& vals) {
     int i = 0;
     // There is likely a faster way to do that than iterating.
-    for (auto v : vals) {
+    for (const auto& v : vals) {
       (*indices)[traits<Derived>::NumDimensions - 1] = i++;
       tensor.coeffRef(*indices) = v;
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ede3939c2..6d5cce4aa 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * \brief Fast integer division by a constant.
   *
   * See the paper from Granlund and Montgomery for explanation.
-  *   (at http://dx.doi.org/10.1145/773473.178249)
+  *   (at https://doi.org/10.1145/773473.178249)
   *
   * \sa Tensor
   */
@@ -35,8 +35,10 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
   {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
     unsigned long index;
     _BitScanReverse(&index, val);
@@ -51,8 +53,10 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
   {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clzll(val);
+#elif defined(SYCL_DEVICE_ONLY)
+    return static_cast<int>(cl::sycl::clz(val));
 #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
     unsigned long index;
     _BitScanReverse64(&index, val);
@@ -86,8 +90,10 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umulhi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -95,9 +101,11 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umul64hi(a, b);
-#elif defined(__SIZEOF_INT128__)
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
+#elif EIGEN_HAS_BUILTIN_INT128
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
@@ -116,7 +124,7 @@ namespace {
   template <typename T>
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
@@ -159,7 +167,7 @@ struct TensorIntDivisor {
     shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
-  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
   // type numerator should also be less than 2^32-1.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
     eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
@@ -195,8 +203,10 @@ class TensorIntDivisor<int32_t, true> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef __CUDA_ARCH__
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return (__umulhi(magic, n) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
 #else
     uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
     return (static_cast<uint32_t>(v >> 32) >> shift);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index cd0109ef4..80106c1a0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -46,6 +46,7 @@ struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename XprType>
@@ -68,39 +69,22 @@ template<typename XprType>
 class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+    typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
-      : m_xpr(expr) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+        : m_xpr(expr) {}
 
     EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
   protected:
     typename XprType::Nested m_xpr;
 };
@@ -118,12 +102,18 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     for(int i = 0; i < NumDims; ++i) {
@@ -131,16 +121,25 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     }
   }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -159,7 +158,9 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -180,11 +181,17 @@ template<typename ArgType, typename Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false  // to be implemented
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index ee0078bbc..73ff3d2db 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
  */
 
 // SFINAE requires variadic templates
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   // SFINAE doesn't work for gcc <= 4.7
   #ifdef EIGEN_COMP_GNUC
@@ -43,12 +43,56 @@
 #define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
     typename internal::enable_if< ( __condition__ ) , int >::type = 0
 
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+  #define EIGEN_DEVICE_REF
+#else
+  #define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#define EIGEN_SYCL_TRY_CATCH(X) \
+  do { \
+    EIGEN_TRY {X;} \
+    EIGEN_CATCH(const cl::sycl::exception& e) { \
+      EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
+                                       std::string(__FILE__) + ":" + \
+                                       std::to_string(__LINE__) + "\n" + \
+                                       e.what())); \
+    } \
+  } while (false)
 
-#if EIGEN_HAS_CONSTEXPR
-#define EIGEN_CONSTEXPR constexpr
+// Define a macro if local memory flags are unset or one of them is set
+// Setting both flags is the same as unsetting them
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+     (defined(EIGEN_SYCL_LOCAL_MEM) &&  defined(EIGEN_SYCL_NO_LOCAL_MEM))
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
+#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
 #else
-#define EIGEN_CONSTEXPR
+  #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
 #endif
 
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived)  \
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
 
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index a8e55757e..6834c97e4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -12,37 +12,57 @@
 
 namespace Eigen {
 
+// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
+
 /** \class TensorMap
   * \ingroup CXX11_Tensor_Module
   *
   * \brief A tensor expression mapping an existing array of data.
   *
   */
-/// template <class> class MakePointer_ is added to convert the host pointer to the device pointer.
-/// It is added due to the fact that for our device compiler T* is not allowed.
-/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T.
-/// This is done through our MakePointer_ class. By default the Type in the MakePointer_<T> is T* .
+/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler `T*` is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`.
+/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
 /// Therefore, by adding the default value, we managed to convert the type and it does not break any
-/// existing code as its default value is T*.
+/// existing code as its default value is `T*`.
 template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
 {
   public:
     typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
-    typedef typename PlainObjectType::Base Base;
-    typedef typename Eigen::internal::nested<Self>::type Nested;
-    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
+  #ifdef EIGEN_USE_SYCL
+    typedef  typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
+  #else
+     typedef typename Eigen::internal::nested<Self>::type Nested;
+  #endif
+   typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename Base::CoeffReturnType CoeffReturnType;
+    typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
 
-  /*    typedef typename internal::conditional<
-                         bool(internal::is_lvalue<PlainObjectType>::value),
-                         Scalar *,
-                         const Scalar *>::type
-                     PointerType;*/
     typedef typename MakePointer_<Scalar>::Type PointerType;
-    typedef PointerType PointerArgType;
+    typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
+
+    // WARN: PointerType still can be a pointer to const (const Scalar*), for
+    // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
+    // expression should be illegal, but adding this restriction is not possible
+    // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        PointerType,      // use simple pointer in lvalue expressions
+        PointerConstType  // use const pointer in rvalue expressions
+        >::type StoragePointerType;
+
+    // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
+    // we should return a reference to const from operator() (and others), even
+    // if TensorMap itself is not const.
+    typedef typename internal::conditional<
+        bool(internal::is_lvalue<PlainObjectType>::value),
+        Scalar&,
+        const Scalar&
+        >::type StorageRefType;
 
     static const int Options = Options_;
 
@@ -57,47 +77,47 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     };
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
       EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
       EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
       EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
       EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
-   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
     template <typename Dimensions>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
@@ -114,12 +134,12 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE PointerType data() { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const PointerType data() const { return m_data; }
+    EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -132,14 +152,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()() const
+    EIGEN_STRONG_INLINE StorageRefType operator()() const
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -147,9 +167,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
@@ -160,7 +181,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i1 + i0 * m_dimensions[1];
@@ -171,7 +192,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
     {
       if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -182,7 +203,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -193,7 +214,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -206,7 +227,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 #endif
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -219,14 +240,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()()
+    EIGEN_STRONG_INLINE StorageRefType operator()()
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -234,9 +255,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+       eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       const std::size_t NumDims = sizeof...(otherIndices) + 2;
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
@@ -248,7 +270,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i1 + i0 * m_dimensions[1];
@@ -259,7 +281,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -270,7 +292,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -281,7 +303,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -293,26 +315,10 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
-    {
-      typedef TensorAssignOp<Self, const Self> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Self& operator=(const OtherDerived& other)
-    {
-      typedef TensorAssignOp<Self, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
 
   private:
-    typename MakePointer_<Scalar>::Type m_data;
+    StoragePointerType m_data;
     Dimensions m_dimensions;
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index 615559d44..a6181d35e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,11 +52,13 @@ struct PacketType : internal::packet_traits<Scalar> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
-template <>
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
+
+typedef ulonglong2 Packet4h2;
+template<>
 struct PacketType<half, GpuDevice> {
-  typedef half2 type;
-  static const int size = 2;
+  typedef Packet4h2 type;
+  static const int size = 8;
   enum {
     HasAdd    = 1,
     HasSub    = 1,
@@ -75,6 +77,7 @@ struct PacketType<half, GpuDevice> {
     HasSqrt   = 1,
     HasRsqrt  = 1,
     HasExp    = 1,
+    HasExpm1  = 0,
     HasLog    = 1,
     HasLog1p  = 0,
     HasLog10  = 0,
@@ -84,9 +87,57 @@ struct PacketType<half, GpuDevice> {
 #endif
 
 #if defined(EIGEN_USE_SYCL)
-template <typename T>
-  struct PacketType<T, SyclDevice> {
-  typedef T type;
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B> struct PlusOp {
+  static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B> struct DivOp {
+  static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+    op(start);
+    static_for<Index, StepOp<Index, start, step>::Value, end, step,
+               StepOp>::loop(op);
+  }
+};
+template <typename Index, Index end, Index step,
+          template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+  static const int PacketSize = 1;
+  typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+  static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
+  return ((((x) + (y)-1) / (y)) * (y));
+}
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <>
+  struct PacketType<half, SyclDevice> {
+  typedef half type;
   static const int size = 1;
   enum {
     HasAdd    = 0,
@@ -103,8 +154,58 @@ template <typename T>
     HasBlend  = 0
   };
 };
-#endif
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+  typedef Scalar type;
+  typedef Scalar half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
+  };
+  enum {
+    HasAdd    = 0,
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0
+  };
+
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
+template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
+{\
+  typedef typename internal::packet_traits<Type>::type type;\
+  typedef typename internal::packet_traits<Type>::half half;\
+};
+
+
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
 
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
+template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
+#endif
+#endif
 
 // Tuple mimics std::pair but works on e.g. nvcc.
 template <typename U, typename V> struct Tuple {
@@ -122,14 +223,6 @@ template <typename U, typename V> struct Tuple {
   Tuple(const U& f, const V& s) : first(f), second(s) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Tuple& operator= (const Tuple& rhs) {
-    if (&rhs == this) return *this;
-    first = rhs.first;
-    second = rhs.second;
-    return *this;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void swap(Tuple& rhs) {
     using numext::swap;
     swap(first, rhs.first);
@@ -168,12 +261,12 @@ template <typename Idx> struct IndexPair {
 #ifdef EIGEN_HAS_SFINAE
 namespace internal {
 
-  template<typename IndexType, Index... Is>
+  template<typename IndexType, typename Index, Index... Is>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
     return { idx[Is]... };
   }
-  template<typename IndexType>
+  template<typename IndexType, typename Index>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
     return array<Index, 0>();
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d34f1e328..b3f00f77a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -31,12 +31,13 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<NewDimensions>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename NewDimensions, typename XprType>
 struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
 {
-  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
+  typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename NewDimensions, typename XprType>
@@ -53,6 +54,7 @@ template<typename NewDimensions, typename XprType>
 class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
 {
   public:
+  typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
   typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
@@ -69,24 +71,7 @@ class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, Xpr
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -101,15 +86,63 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
   typedef NewDimensions Dimensions;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+
+  static const int NumOutputDims = internal::array_size<Dimensions>::value;
+  static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
+  enum ReshapingKind {
+    // We do not use layout information to determine reshaping kind.
+    // Depending on the layout `N` can be inner or outer dimension.
+    OneByN = 0,  // expr.reshape(1, N)
+    NByOne = 1,  // expr.reshape(N, 1)
+    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
+  };
+
+  // clang-format off
+  static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+      : Runtime;
+#else
+        Runtime;
+#endif
+  // clang-format on
+
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    // For trivial reshapes with raw access to underlying data we will provide
+    // zero overhead block access.
+    // TODO(ezhulenev): Consider adding block access without raw access?
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess &&
+                        NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef
+      typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+                                                 Layout, Index>
+          TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions())
   {
     // The total size of the reshaped tensor must be equal to the total size
@@ -117,17 +150,20 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -146,10 +182,53 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) ||
+                 (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView,
+                           m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might be also
+      // a view, or it might be a block materialized in the temporary buffer.
+      return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
+                                        scratch);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+    return constCast(m_impl.data());
+  }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
+  #ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+  #endif
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
@@ -167,14 +246,16 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef NewDimensions Dimensions;
 
   enum {
-    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = false,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
 
@@ -183,15 +264,38 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+      TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }
+
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    assert(this->m_impl.data() != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<
+        Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(),
+                                  internal::strides<Layout>(this->dimensions()),
+                                  this->m_impl.data(), desc.offset()),
+        block.expr());
+  }
 };
 
 
@@ -214,12 +318,13 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
 struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
 {
-  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
@@ -236,6 +341,7 @@ template<typename StartIndices, typename Sizes, typename XprType>
 class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
 {
   public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
   typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
@@ -254,25 +360,7 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -283,9 +371,12 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
 
 // Fixme: figure out the exact threshold
 namespace {
-template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
+template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
+  EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
 
  private:
   Index threshold_;
@@ -294,11 +385,21 @@ template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
 // It is very expensive to start the memcpy kernel on GPU: we therefore only
 // use it for large copies.
 #ifdef EIGEN_USE_GPU
-template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice>  {
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess>  {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
 };
 #endif
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_SYCL
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess>  {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
 }
 
 // Eval as rvalue
@@ -308,23 +409,56 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                        // FIXME: Temporary workaround for bug in slicing of bool tensors.
+                        !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  // Tensor slicing does not change the block type.
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
-    for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
-      eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+    m_is_identity = true;
+    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >=
+                   op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] ||
+          op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
     }
 
+    // No strides for scalars.
+    if (NumDims == 0) return;
+
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -337,7 +471,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = 1;
@@ -349,23 +483,17 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data()) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
+        && data && m_impl.data()) {
       Index contiguous_values = 1;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = 0; i < NumDims; ++i) {
@@ -383,12 +511,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
         }
       }
       // Use memcpy if it's going to be faster than using the regular evaluation.
-      const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
-      if (trigger(contiguous_values)) {
-        Scalar* src = (Scalar*)m_impl.data();
-        for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+      const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+      if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
+        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
+        for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
           Index offset = srcCoeff(i);
-          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
+          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
         }
         return false;
       }
@@ -396,25 +524,42 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
 
+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
+
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -426,6 +571,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -445,6 +591,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -454,12 +601,28 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+    TensorBlock block = m_impl.block(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+    return block;
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
-    Scalar* result = m_impl.data();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -493,12 +656,19 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -506,6 +676,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
       inputIndex += (index + m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -520,8 +691,9 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
+  bool m_is_identity;
   const StartIndices m_offsets;
 };
 
@@ -535,36 +707,55 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
-  enum {
-    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : Base(op, device)
-    { }
-
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Sizes Dimensions;
 
+  enum {
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,
+    RawAccess         = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -576,6 +767,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -595,14 +787,20 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
     }
   }
-};
-
 
+  template<typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+    this->m_impl.writeBlock(arg_desc, block);
+  }
+};
 
 namespace internal {
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -616,12 +814,13 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -637,6 +836,7 @@ template<typename StartIndices, typename StopIndices, typename Strides, typename
 class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
 {
   public:
+  typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
   typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
@@ -660,26 +860,7 @@ class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartI
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(
-          assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -694,6 +875,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
 {
   typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
   static const int NumDims = internal::array_size<Strides>::value;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef Strides Dimensions;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -701,43 +889,58 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_strides(op.strides())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
-    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
-    for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+    DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+    for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
-      if(m_strides[i]>0){
-        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
-      }else{
-        /* implies m_strides[i]<0 by assert */
-        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
-        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      if (m_strides[i] > 0) {
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      } else {
+        /* implies m_strides[i] < 0 by assert */
+        startIndicesClamped[i] =
+            clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] =
+            clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
       }
       m_startIndices[i] = startIndicesClamped[i];
     }
 
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+    const InputDimensions& input_dims = m_impl.dimensions();
 
-    // check for degenerate intervals and compute output tensor shape
-    bool degenerate = false;;
-    for(int i = 0; i < NumDims; i++){
+    // compute output tensor shape
+    m_is_identity = true;
+    for (int i = 0; i < NumDims; i++) {
       Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
-      if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
+      if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
         m_dimensions[i] = 0;
-        degenerate = true;
-      }else{
-        m_dimensions[i] = interval / m_strides[i]
-                          + (interval % m_strides[i] != 0 ? 1 : 0);
+      } else {
+        m_dimensions[i] =
+            (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
         eigen_assert(m_dimensions[i] >= 0);
       }
+      if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
+        m_is_identity = false;
+      }
     }
+
     Strides output_dims = m_dimensions;
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -754,8 +957,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
       m_outputStrides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     } else {
       m_inputStrides[NumDims-1] = m_strides[NumDims-1];
@@ -770,58 +972,58 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
       m_outputStrides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
-        // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
       }
     }
-    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
-                                          device.lastLevelCacheSize() /
-                                          sizeof(Scalar));
   }
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Strides Dimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
     return NULL;
   }
-
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -831,20 +1033,24 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     return inputIndex;
   }
 
-  static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef SYCL_DEVICE_ONLY
     return numext::maxi(min, numext::mini(max,value));
+#else
+    return cl::sycl::clamp(value, min, max);
+#endif
   }
 
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
-  std::size_t m_block_total_size_max;
 };
 
 // Eval as lvalue
@@ -860,25 +1066,33 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Strides Dimensions;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    return this->m_impl.coeffRef(this->srcCoeff(index));
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
   }
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 647bcf108..ee44382cf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -31,6 +31,7 @@ struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprT
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PaddingDimensions, typename XprType>
@@ -90,18 +91,33 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = true,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = true,
-    RawAccess = false
+    IsAligned         = true,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = true,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -135,11 +151,20 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -148,6 +173,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -161,6 +187,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
       inputIndex += (index - m_padding[0].first);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -189,18 +216,298 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i)
         updateCostPerDimension(cost, i, i == 0);
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i)
         updateCostPerDimension(cost, i, i == NumDims - 1);
     }
     return cost;
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+        m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL,
+                           desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets, it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides =
+        internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const Index input_inner_dim_size =
+        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+                           output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
+                                             output_inner_pad_before_size),
+                     Index(0)));
+
+    eigen_assert(output_inner_copy_size >= 0);
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size -
+         output_inner_pad_before_size);
+
+    // Sanity check, sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size +
+                  output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Prepare storage for the materialized padding result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+
+    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
+    // single logical inner dimension.
+
+    // When possible we squeeze writes for the innermost (only if non-padded)
+    // dimension with the first padded dimension. This allows to reduce the
+    // number of calls to LinCopy and better utilize vector instructions.
+    const bool squeeze_writes =
+        NumDims > 1 &&
+        // inner dimension is not padded
+        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
+        // and equal to the block inner dimension
+        (input_inner_dim_size == output_inner_dim_size);
+
+    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
+
+    // Maximum coordinate on a squeeze dimension that we can write to.
+    const Index squeeze_max_coord =
+        squeeze_writes ? numext::mini(
+                             // max non-padded element in the input
+                             static_cast<Index>(m_dimensions[squeeze_dim] -
+                                                m_padding[squeeze_dim].second),
+                             // max element in the output buffer
+                             static_cast<Index>(output_offsets[squeeze_dim] +
+                                                desc.dimension(squeeze_dim)))
+                       : static_cast<Index>(0);
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size;) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill single innermost dimension with padding value.
+        size += output_inner_dim_size;
+
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(0, 0, &m_paddingValue),
+            output_inner_dim_size);
+
+
+      } else if (squeeze_writes) {
+        // Squeeze multiple reads from innermost dimensions.
+        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
+        size += output_inner_dim_size * squeeze_num;
+
+        // Copy `squeeze_num` inner dimensions from input to output.
+        LinCopy::template Run<LinCopy::Kind::Linear>(
+            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+            typename LinCopy::Src(input_offset, 1, m_impl.data()),
+            output_inner_dim_size * squeeze_num);
+
+        // Update iteration state for only `squeeze_num - 1` processed inner
+        // dimensions, because we have another iteration state update at the end
+        // of the loop that will update iteration state for the last inner
+        // processed dimension.
+        it[0].count += (squeeze_num - 1);
+        input_offset += it[0].input_stride * (squeeze_num - 1);
+        output_offset += it[0].output_stride * (squeeze_num - 1);
+        output_coord[squeeze_dim] += (squeeze_num - 1);
+
+      } else {
+        // Single read from innermost dimension.
+        size += output_inner_dim_size;
+
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(in, 1, m_impl.data()),
+              output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size +
+                            output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(
+              typename LinCopy::Dst(out, 1, block_storage.data()),
+              typename LinCopy::Src(0, 0, &m_paddingValue),
+              output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : count(0),
+          size(0),
+          input_stride(0),
+          input_span(0),
+          output_stride(0),
+          output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -262,22 +569,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -289,21 +597,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[0].first;
     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
     const Index lastPaddedRight = m_outputStrides[1];
 
-    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[0].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -319,23 +627,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
-
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i+1];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -347,21 +655,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[NumDims-1].first;
     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
     const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[NumDims-1].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -373,6 +681,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -387,6 +696,8 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   PaddingDimensions m_padding;
 
   Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 886a254f6..413d25dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -31,6 +31,7 @@ struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PatchDim, typename XprType>
@@ -87,18 +88,26 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
  };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     Index num_patches = 1;
@@ -143,12 +152,12 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -161,6 +170,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -169,6 +179,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -196,6 +207,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
     Index inputIndices[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -211,6 +223,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -237,6 +250,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
       EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -253,7 +267,14 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh); 
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
@@ -262,6 +283,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   array<Index, NumDims-1> m_patchStrides;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 1655a813e..37c1d1c3d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -16,50 +17,23 @@ namespace internal {
 namespace {
 
 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
-  assert(threadIdx.z == 0);
-  return clock64() +
-      blockIdx.x * blockDim.x + threadIdx.x +
-      gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
-
-#elif defined _WIN32
-  // Use the current time as a baseline.
-  SYSTEMTIME st;
-  GetSystemTime(&st);
-  int time = st.wSecond + 1000 * st.wMilliseconds;
-  // Mix in a random number to make sure that we get different seeds if
-  // we try to generate seeds faster than the clock resolution.
-  // We need 2 random values since the generator only generate 16 bits at
-  // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
-  int rnd1 = ::rand();
-  int rnd2 = ::rand();
-  uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
-  return rnd;
-
-#elif defined __APPLE__
-  // Same approach as for win32, except that the random number generator
-  // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
-  uint64_t rnd = ::random() ^ mach_absolute_time();
-  return rnd;
-
+  gpu_assert(threadIdx.z == 0);
+  return blockIdx.x * blockDim.x + threadIdx.x 
+         + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
 #else
-  // Augment the current time with pseudo random number generation
-  // to ensure that we get different seeds if we try to generate seeds
-  // faster than the clock resolution.
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  uint64_t rnd = ::random() ^ ts.tv_nsec;
-  return rnd;
+  // Rely on Eigen's random implementation.
+  return random<uint64_t>();
 #endif
 }
 
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
   // TODO: Unify with the implementation in the non blocking thread pool.
   uint64_t current = *state;
   // Update the internal state
-  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
   // Generate the random output (using the PCG-XSH-RS scheme)
   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }
@@ -73,34 +47,42 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state) {
-  unsigned rnd = PCG_XSH_RS_generator(state);
+T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
   return static_cast<T>(rnd);
 }
 
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
-  Eigen::half result;
-  // Generate 10 random bits for the mantissa
-  unsigned rnd = PCG_XSH_RS_generator(state);
-  result.x = static_cast<uint16_t>(rnd & 0x3ffu);
-  // Set the exponent
-  result.x |= (static_cast<uint16_t>(15) << 10);
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+  // Generate 10 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
   // Return the final result
   return result - Eigen::half(1.0f);
 }
 
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
+
+  // Generate 7 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state) {
+float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint32_t raw;
     float fp;
   } internal;
   internal result;
   // Generate 23 random bits for the mantissa mantissa
-  const unsigned rnd = PCG_XSH_RS_generator(state);
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
   result.raw = rnd & 0x7fffffu;
   // Set the exponent
   result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -109,7 +91,7 @@ float RandomToTypeUniform<float>(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state) {
+double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint64_t raw;
     double dp;
@@ -118,9 +100,9 @@ double RandomToTypeUniform<double>(uint64_t* state) {
   result.raw = 0;
   // Generate 52 random bits for the mantissa
   // First generate the upper 20 bits
-  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
   // The generate the lower 32 bits
-  unsigned rnd2 = PCG_XSH_RS_generator(state);
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
   // Set the exponent
   result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -129,14 +111,14 @@ double RandomToTypeUniform<double>(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeUniform<float>(state),
-                             RandomToTypeUniform<float>(state));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
+                             RandomToTypeUniform<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeUniform<double>(state),
-                              RandomToTypeUniform<double>(state));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
+                              RandomToTypeUniform<double>(state, stream));
 }
 
 template <typename T> class UniformRandomGenerator {
@@ -147,17 +129,42 @@ template <typename T> class UniformRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two step to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+    // similar to CUDA Therefore, the thread Id injection is not available at this stage.
+    //However when the operator() is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
+    #ifdef EIGEN_USE_SYCL
+     m_exec_once =other.m_exec_once;
+    #endif
   }
 
   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeUniform<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }
 
@@ -165,16 +172,25 @@ template <typename T> class UniformRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
+      #ifdef EIGEN_USE_SYCL
+      if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+       m_state += (i * 6364136223846793005ULL);
+       m_exec_once =true;
+      }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeUniform<T>(&local_state);
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
-    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+  #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 template <typename Scalar>
@@ -190,14 +206,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > {
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state) {
+T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
   // Use the ratio of uniform method to generate numbers following a normal
   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
   // details.
   T u, v, q;
   do {
-    u = RandomToTypeUniform<T>(state);
-    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
     const T x = u - T(0.449871);
     const T y = numext::abs(v) + T(0.386595);
     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -208,14 +224,14 @@ T RandomToTypeNormal(uint64_t* state) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
-  return std::complex<float>(RandomToTypeNormal<float>(state),
-                             RandomToTypeNormal<float>(state));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
+                             RandomToTypeNormal<float>(state, stream));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
-  return std::complex<double>(RandomToTypeNormal<double>(state),
-                              RandomToTypeNormal<double>(state));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
+                              RandomToTypeNormal<double>(state, stream));
 }
 
 
@@ -226,17 +242,38 @@ template <typename T> class NormalRandomGenerator {
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
+    #ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefor, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Therefore, the thread Id injection is not available at this stage. However when the operator()
+    //is called the thread ID will be avilable. So inside the opeator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    //to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once =false;
+   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once=other.m_exec_once;
+#endif
   }
 
  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    uint64_t local_state = m_state + i;
-    T result = RandomToTypeNormal<T>(&local_state);
-    m_state = local_state;
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }
 
@@ -244,16 +281,25 @@ template <typename T> class NormalRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    uint64_t local_state = m_state + i;
+    #ifdef EIGEN_USE_SYCL
+    if(!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once =true;
+    }
+    #endif
+    EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeNormal<T>(&local_state);
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
     }
-    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
+   #ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+  #endif
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 41d0d0022..583f46256 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,8 +11,20 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
 namespace Eigen {
 
+
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -32,6 +44,7 @@ namespace internal {
   typedef typename XprType::Nested Nested;
   static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 
   template <class T> struct MakePointer {
     // Intermediate typedef to workaround MSVC issue.
@@ -152,7 +165,9 @@ struct GenericDimReducer<-1, Self, Op> {
   }
 };
 
-template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+                                   !Self::ReducerTraits::IsExactlyAssociative)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -164,23 +179,100 @@ struct InnerMostDimReducer {
 };
 
 template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true> {
+struct InnerMostDimReducer<Self, Op, true, false> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
     const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
+    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
     for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
     }
     typename Self::CoeffReturnType accum = reducer.initialize();
     for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
     }
-    return reducer.finalizeBoth(accum, p);
+    return reducer.finalizeBoth(accum, paccum);
   }
 };
 
-template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+#if !defined(EIGEN_HIPCC) 
+static const int kLeafSize = 1024;
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, false, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > kLeafSize) {
+      const typename Self::Index half = numValuesToReduce / 2;
+      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
+      reducer.reduce(
+          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
+          &accum);
+    } else {
+      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+    }
+    return reducer.finalize(accum);
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+  reduce(const Self& self, typename Self::Index firstIndex,
+         typename Self::Index numValuesToReduce, Op& reducer) {
+    const typename Self::Index packetSize =
+        internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > packetSize * kLeafSize) {
+      // Make sure the split point is aligned on a packet boundary.
+      const typename Self::Index split =
+          packetSize *
+          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+                packetSize);
+      const typename Self::Index num_left =
+          numext::mini(split - firstIndex, numValuesToReduce);
+      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
+      if (num_left < numValuesToReduce) {
+        reducer.reduce(
+            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
+      }
+      return reducer.finalize(accum);
+    } else {
+      const typename Self::Index UnrollSize =
+          (numValuesToReduce / (2*packetSize)) * 2*packetSize;
+      const typename Self::Index VectorizedSize =
+          (numValuesToReduce / packetSize) * packetSize;
+      typename Self::PacketReturnType paccum =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 =
+          reducer.template initializePacket<typename Self::PacketReturnType>();
+      for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+        reducer.reducePacket(
+            self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+            &paccum2);
+      }
+      for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+        reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+                                 firstIndex + j), &paccum);
+      }
+      reducer.reducePacket(paccum2, &paccum);
+      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
+           ++j) {
+        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+      }
+      return reducer.finalizeBoth(accum, paccum);
+    }
+  }
+};
+#endif
+ 
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
     eigen_assert(false && "should never be called");
@@ -215,11 +307,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> {
 };
 
 // Default full reducer
-template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducer {
   static const bool HasOptimizedImplementation = false;
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
@@ -229,7 +321,7 @@ struct FullReducer {
 #ifdef EIGEN_USE_THREADS
 // Multithreaded full reducers
 template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                   typename Self::Index numValuesToReduce, Op& reducer,
@@ -242,8 +334,8 @@ struct FullReducerShard {
 // Multithreaded full reducer
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
-  static const bool HasOptimizedImplementation = !Op::IsStateful;
-  static const int PacketSize =
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
+  static const Index PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
 
   // launch one reducer per thread and accumulate the result.
@@ -320,29 +412,58 @@ struct OuterReducer {
   }
 };
 
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+  static const bool HasOptimizedImplementation = false;
 
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+#endif
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
 
 
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename S, typename R, typename I>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#if defined(EIGEN_HAS_GPU_FP16)
+template <typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 
 #endif
 
-template <int NPT, typename S, typename R, typename I>
-__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
-template <int NPT, typename S, typename R, typename I>
-__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
+ * with the accumulator and the other for reducing two accumulators.
+ * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if defined(EIGEN_USE_SYCL)
+  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
+#else
+  typedef typename remove_const<CoeffReturnType>::type type;
+#endif
+};
+
 }  // end namespace internal
 
 
@@ -376,11 +497,15 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
     const Op m_reducer;
 };
 
+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
 
 // Eval as rvalue
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
+  typedef Dims ReducedDims;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -390,26 +515,42 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   static const int NumOutputDims = NumInputDims - NumReducedDims;
   typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
-  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+    // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
+    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = true,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
+  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -434,11 +575,13 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         m_outputStrides[0] = 1;
         for (int i = 1; i < NumOutputDims; ++i) {
           m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       } else {
-        m_outputStrides.back() = 1;
+        m_outputStrides[NumOutputDims - 1] = 1;
         for (int i = NumOutputDims - 2; i >= 0; --i) {
           m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       }
     }
@@ -466,6 +609,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           ++reduceIndex;
         } else {
           m_preservedStrides[outputIndex] = input_strides[i];
+          m_output_to_input_dim_map[outputIndex] = i;
           ++outputIndex;
         }
       }
@@ -475,13 +619,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     if (NumOutputDims == 0) {
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }
+
+    m_numValuesToReduce =
+        NumOutputDims == 0
+            ? internal::array_prod(input_dims)
+            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                  ? m_preservedStrides[0]
+                  : m_preservedStrides[NumOutputDims - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
     // Use the FullReducer if possible.
     if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
@@ -489,7 +639,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
          !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
+        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
         data = m_result;
         need_assign = true;
       }
@@ -497,20 +647,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
       return need_assign;
     }
-    else if(RunningOnSycl){
-      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
-      }
-      Op reducer(m_reducer);
-      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-      return (m_result != NULL);
-    }
 
     // Attempt to use an optimized reduction.
-    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -524,8 +663,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -533,9 +672,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always return false
         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate(m_result);
+            m_device.deallocate_temp(m_result);
             m_result = NULL;
           }
           return true;
@@ -557,8 +697,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
             m_result = data;
           }
           else {
@@ -566,9 +706,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           }
         }
         Op reducer(m_reducer);
+        // For SYCL this if always return false
         if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate(m_result);
+            m_device.deallocate_temp(m_result);
             m_result = NULL;
           }
           return true;
@@ -576,21 +717,54 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
           return (m_result != NULL);
         }
       }
+      #if defined(EIGEN_USE_SYCL)
+      // If there is no Optimised version for SYCL, the reduction expression 
+      // must break into two subexpression and use the SYCL generic Reducer on the device.
+      if(RunningOnSycl) {
+         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+         if (!data) {
+           data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+           m_result = data;
+         }
+         Op reducer(m_reducer);
+         internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+         return (m_result != NULL);
+       }
+      #endif
     }
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE
+      void
+      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
+                                EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
+      done(evalSubExprsIfNeededCommon(data));
+    });
+  }
+#endif
+
+  EIGEN_STRONG_INLINE
+  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return evalSubExprsIfNeededCommon(data);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
     if (m_result) {
-      m_device.deallocate(m_result);
+      m_device.deallocate_temp(m_result);
       m_result = NULL;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
+    if (( RunningFullReduction || RunningOnGPU) && m_result ) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
@@ -662,37 +836,52 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
     }
   }
 
-  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
-  /// required by sycl in order to extract the accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  /// added for sycl in order to construct the buffer from the sycl device
-  const Device& device() const{return m_device;}
-  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
-  const Dims& xprDims() const {return m_xpr_dims;}
-
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_result.bind(cgh);
+  }
+#endif
 
   private:
   template <int, typename, typename> friend struct internal::GenericDimReducer;
-  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
+  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
   template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
 #ifdef EIGEN_USE_THREADS
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#ifdef EIGEN_HAS_CUDA_FP16
-  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_HAS_GPU_FP16)
+  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
+  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 #endif
-  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
-  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
+#if defined(EIGEN_USE_SYCL)
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
+ // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
+ template <typename, typename, typename> friend struct internal::GenericReducer;
+#endif
+
+
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
+  struct BlockIteratorState {
+    Index input_dim;
+    Index output_size;
+    Index output_count;
+  };
+
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -741,10 +930,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   Dimensions m_dimensions;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
-  // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
   array<Index, NumPreservedStrides> m_preservedStrides;
+  // Map from output to input dimension index.
+  array<Index, NumOutputDims> m_output_to_input_dim_map;
+  // How many values go into each reduction
+  Index m_numValuesToReduce;
 
   // Subset of strides of the input tensor for the reduced dimensions.
   // Indexed by reduced dimensions.
@@ -760,7 +951,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
   Op m_reducer;
 
   // For full reductions
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
   static const bool RunningOnSycl = false;
 #elif defined(EIGEN_USE_SYCL)
@@ -770,10 +961,36 @@ static const bool RunningOnGPU = false;
   static const bool RunningOnGPU = false;
   static const bool RunningOnSycl = false;
 #endif
-  typename MakePointer_<CoeffReturnType>::Type m_result;
+  EvaluatorPointerType m_result;
 
-  const Device& m_device;
-  const Dims& m_xpr_dims;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the coeff function should be overridden by for SYCL kernel
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+  //Therefore the packet function should be overridden by for SYCL kernel
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
 };
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 65638b6a8..68780cd3c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -1,750 +1,6 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple cuda thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another cuda thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if __CUDA_ARCH__ >= 300
-  if (sizeof(T) == 4)
-  {
-    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-    unsigned int newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned int readback;
-    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else if (sizeof(T) == 8) {
-    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
-    unsigned long long newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else {
-    assert(0 && "Wordsize not supported");
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
-  return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
-  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
-  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-  unsigned int newval = oldval;
-  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-  if (newval == oldval) {
-    return;
-  }
-  unsigned int readback;
-  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-    oldval = readback;
-    newval = oldval;
-    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-  }
-}
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
 #endif
 
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if __CUDA_ARCH__ >= 300
-  atomicAdd(output, accum);
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-    output[i] = val;
-  }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if __CUDA_ARCH__ >= 300
-  // Initialize the output value
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      *output = reducer.initialize();
-    }
-  }
-  else {
-    if (threadIdx.x == 0) {
-      unsigned int block = atomicCAS(semaphore, 0u, 1u);
-      if (block == 0) {
-        // We're the first block to run, initialize the output value
-        atomicExchCustom(output, reducer.initialize());
-        __threadfence();
-        atomicExch(semaphore, 2u);
-      }
-      else {
-        // Wait for the first block to initialize the output value.
-        // Use atomicCAS here to ensure that the reads aren't cached
-        unsigned int val;
-        do {
-          val = atomicCAS(semaphore, 2u, 2u);
-        }
-        while (val < 2u);
-      }
-    }
-  }
-
-  __syncthreads();
-
-  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
-
-  typename Self::CoeffReturnType accum = reducer.initialize();
-  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
-  for (Index i = 0; i < max_iter; i+=BlockSize) {
-    const Index index = first_index + i;
-    eigen_assert(index < num_coeffs);
-    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
-    reducer.reduce(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(output, accum, reducer);
-  }
-
-  if (gridDim.x > 1 && threadIdx.x == 0) {
-    // Let the last block reset the semaphore
-    atomicInc(semaphore, gridDim.x + 1);
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
-  eigen_assert(blockDim.x == 1);
-  eigen_assert(gridDim.x == 1);
-  if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs-1);
-    *scratch = __halves2half2(last, reducer.initialize());
-  } else {
-    *scratch = reducer.template initializePacket<half2>();
-  }
-}
-
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index num_packets = num_coeffs / 2;
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    ((half2*)output)[i] = reducer.template initializePacket<half2>();
-  }
-
-  if (thread_id == 0 && num_coeffs % 2 != 0) {
-    output[num_coeffs-1] = reducer.initialize();
-  }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
-                                    half* output, half2* scratch) {
-  eigen_assert(NumPerThread % 2 == 0);
-
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    if (num_coeffs % 2 != 0) {
-      half last = input.m_impl.coeff(num_coeffs-1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
-    }
-    __syncthreads();
-  }
-
-  half2 accum = reducer.template initializePacket<half2>();
-  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + 2*i;
-    eigen_assert(index + 1 < num_coeffs);
-    half2 val = input.m_impl.template packet<Unaligned>(index);
-    reducer.reducePacket(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
-  }
-
-  __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
-  }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
-  eigen_assert(threadIdx.x == 1);
-  half tmp = __low2half(*scratch);
-  reducer.reduce(__high2half(*scratch), &tmp);
-  *output = tmp;
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on doubles, floats and half floats");
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
-    Self, Op, OutputType, PacketAccess,
-    typename internal::enable_if<
-      internal::is_same<float, OutputType>::value ||
-      internal::is_same<double, OutputType>::value,
-    void>::type> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-    typedef typename Self::CoeffReturnType Scalar;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    unsigned int* semaphore = NULL;
-    if (num_blocks > 1) {
-      semaphore = device.semaphore();
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
-  }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
-  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-  }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    half2* scratch = static_cast<half2*>(device.scratchpad());
-
-    if (num_blocks > 1) {
-      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
-    }
-
-    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
-    if (num_blocks > 1) {
-      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
-                         1, 1, 0, device, reducer, output, scratch);
-    }
-  }
-};
-#endif
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple cases
-  // of doubles, floats and half floats
-#ifdef EIGEN_HAS_CUDA_FP16
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                         typename Self::CoeffReturnType* output) {
-#if __CUDA_ARCH__ >= 300
-  typedef typename Self::CoeffReturnType Type;
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
-  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = i / input_col_blocks;
-
-    if (row < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
-      Type reduced_val = reducer.initialize();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
-        if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            reducer.reduce(val, &reduced_val);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k);
-            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-      }
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        atomicReduce(&(output[row]), reduced_val, reducer);
-      }
-    }
-  }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
-}
-
-#ifdef EIGEN_HAS_CUDA_FP16
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                              half* output) {
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-  eigen_assert(unroll_times % 2 == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    Index i = 2*thread_id;
-    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
-      half* loc = output + i;
-      *((half2*)loc) = reducer.template initializePacket<half2>();
-    }
-    if (i < num_preserved_coeffs) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = 2 * (i / input_col_blocks);
-
-    if (row + 1 < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
-
-      half2 reduced_val1 = reducer.template initializePacket<half2>();
-      half2 reduced_val2 = reducer.template initializePacket<half2>();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
-        if (last_col >= num_coeffs_to_reduce) {
-          Index col = col_begin + blockDim.x * j;
-          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
-            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val1, &reduced_val1);
-            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          if (col < num_coeffs_to_reduce) {
-            // Peel;
-            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            const half2 val1 = __halves2half2(last1, reducer.initialize());
-            reducer.reducePacket(val1, &reduced_val1);
-            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
-            const half2 val2 = __halves2half2(last2, reducer.initialize());
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k) * 2;
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
-        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
-      }
-
-      half val1 =  __low2half(reduced_val1);
-      reducer.reduce(__high2half(reduced_val1), &val1);
-      half val2 =  __low2half(reduced_val2);
-      reducer.reduce(__high2half(reduced_val2), &val2);
-      half2 val = __halves2half2(val1, val2);
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
-      }
-    }
-  }
-}
-
-#endif
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
-    return true;
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
-  Self, Op, OutputType, PacketAccess,
-  typename internal::enable_if<
-    internal::is_same<float, OutputType>::value ||
-    internal::is_same<double, OutputType>::value,
-  void>::type> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
-  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should not be called since there is no packet accessor");
-    return true;
-  }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    if (num_preserved_vals % 2 != 0) {
-      // Not supported yet, revert to the slower code path
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = /*256*/128;
-    const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-#endif
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
-#ifdef EIGEN_HAS_CUDA_FP16
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif
-
-  template <typename OutputType>
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return true;
-    }
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 128) {
-      return true;
-    }
-
-    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
-  }
-};
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                     typename Self::CoeffReturnType* output) {
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
-  for (Index i = thread_id; i < max_iter; i += num_threads) {
-    const Index input_col = i % num_preserved_coeffs;
-    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
-    typename Self::CoeffReturnType reduced_val = reducer.initialize();
-    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
-    for (Index j = input_row; j < max_row; j++) {
-      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
-      reducer.reduce(val, &reduced_val);
-    }
-    atomicReduce(&(output[input_col]), reduced_val, reducer);
-  }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-  template <typename Device, typename OutputType>
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
-    return true;
-  }
-
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 32) {
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs in the reduction kernel itself when we don't have to worry
-      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumCudaMultiProcessors() *
-                             device.maxCudaThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#endif
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#include "TensorReductionGpu.h"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 000000000..db4e8d866
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,966 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple gpu thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another gpu thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    gpu_assert(0 && "Wordsize not supported");
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
+template <typename R>
+__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
+  half2* houtput=reinterpret_cast<half2*>(output);
+  half2* haccum=reinterpret_cast<half2*>(&accum);
+  for(int i=0;i<4;++i){
+    atomicReduce(houtput+i,*(haccum+i),reducer);
+  }
+}
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  atomicAdd(output, accum);
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    // use std::is_floating_point to determine the type of reduced_val 
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+    // and list the float and int versions of __shfl_down as the candidate functions. 
+    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+    } else {
+      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+    }
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+    __threadfence_system();
+#endif
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                                      packet_traits<Eigen::half>::type* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<packet_type>::size);
+  if (packet_remainder != 0) {
+    half2* h2scratch = reinterpret_cast<half2*>(scratch);
+    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+      *h2scratch =
+          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
+      h2scratch++;
+    }
+    if ((num_coeffs & 1) != 0) {
+      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
+      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+    }
+  } else {
+    *scratch = reducer.template initializePacket<packet_type>();
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+
+  const Index num_packets =
+      num_coeffs / Index(unpacket_traits<PacketType>::size);
+  PacketType* p_output = reinterpret_cast<PacketType*>(output);
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    p_output[i] = reducer.template initializePacket<PacketType>();
+  }
+  Index packet_remainder =
+      num_coeffs % Index(unpacket_traits<PacketType>::size);
+  if (thread_id < packet_remainder) {
+    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                    half* output, packet_traits<Eigen::half>::type* scratch) {
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  eigen_assert(NumPerThread % packet_width == 0);
+  const Index first_index =
+      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      int rem = num_coeffs % packet_width;
+      if (rem != 0) {
+        half2* p_scratch = reinterpret_cast<half2*>(scratch);
+        *scratch = reducer.template initializePacket<PacketType>();
+        for (int i = 0; i < rem / 2; i++) {
+          *p_scratch = __halves2half2(
+              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
+              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
+          p_scratch++;
+        }
+        if ((num_coeffs & 1) != 0) {
+          half last = input.m_impl.coeff(num_coeffs - 1);
+          *p_scratch = __halves2half2(last, reducer.initialize());
+        }
+      } else {
+        *scratch = reducer.template initializePacket<PacketType>();
+      }
+    }
+    __syncthreads();
+  }
+
+  PacketType accum = reducer.template initializePacket<PacketType>();
+  const Index max_iter =
+      numext::mini<Index>((num_coeffs - first_index) / packet_width,
+                          NumPerThread * BlockSize / packet_width);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + packet_width * i;
+    eigen_assert(index + packet_width < num_coeffs);
+    PacketType val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_HIPCC)
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union { int i; half2 h; } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down(hacc[i], offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+  #else
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+
+  #endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+  half2* rv1 = reinterpret_cast<half2*>(scratch);
+  if (packet_width > 2) {
+    reducer.reducePacket(rv1[2], rv1);
+    reducer.reducePacket(rv1[3], rv1 + 1);
+    reducer.reducePacket(rv1[1], rv1);
+  }
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      half tmp = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &tmp);
+      *output = tmp;
+    }
+  }
+}
+
+template <typename Op>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  half2* pscratch = reinterpret_cast<half2*>(scratch);
+  half tmp = __float2half(0.f);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
+    reducer.reduce(__low2half(*pscratch), &tmp);
+    reducer.reduce(__high2half(*pscratch), &tmp);
+    pscratch++;
+  }
+  *output = tmp;
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    gpu_assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+
+    typedef typename Self::Index Index;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename packet_traits<Eigen::half>::type PacketType;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
+    // half2* scratch = static_cast<half2*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                         typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        // use std::is_floating_point to determine the type of reduced_val 
+       // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
+       // and list the float and int versions of __shfl_down as the candidate functions. 
+        if (std::is_floating_point<Type>::value) {
+          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+        } else {
+          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+        }
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      #else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+      #endif
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  const int unroll_times = 16 / packet_width;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = packet_width * thread_id;
+    for (; i + packet_width <= num_preserved_coeffs;
+         i += packet_width * num_threads) {
+      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+      *poutput = reducer.template initializePacket<PacketType>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin =
+          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col =
+            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + packet_width <= num_coeffs_to_reduce;
+               col += blockDim.x) {
+            const PacketType val1 = input.m_impl.template packet<Unaligned>(
+                row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const PacketType val2 = input.m_impl.template packet<Unaligned>(
+                (row + 1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            PacketType r1 = reducer.template initializePacket<PacketType>();
+            PacketType r2 = reducer.template initializePacket<PacketType>();
+            half2* hr1 = reinterpret_cast<half2*>(&r1);
+            half2* hr2 = reinterpret_cast<half2*>(&r2);
+            while (col + 1 < num_coeffs_to_reduce) {
+              *hr1 = __halves2half2(
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+              *hr2 = __halves2half2(
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
+                                     1));
+              hr1++;
+              hr2++;
+              col += 2;
+            }
+            if (col < num_coeffs_to_reduce) {
+              // Peel;
+              const half last1 =
+                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+              *hr1 = __halves2half2(last1, reducer.initialize());
+              const half last2 =
+                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+              *hr2 = __halves2half2(last2, reducer.initialize());
+            }
+            reducer.reducePacket(r1, &reduced_val1);
+            reducer.reducePacket(r2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     row * num_coeffs_to_reduce + col),
+                                 &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+                                     (row + 1) * num_coeffs_to_reduce + col),
+                                 &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_HIPCC)
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+	  union { int i; half2 h; } wka_in1, wka_out1;
+	  wka_in1.h = rv1[i];
+	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+	  hr1[i] = wka_out1.h;
+
+	  union { int i; half2 h; } wka_in2, wka_out2;
+	  wka_in2.h = rv2[i];
+	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+	  hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+      #else
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
+          hr2[i] =
+              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+
+      #endif
+      }
+      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+      half2 val;
+      if (packet_width > 2) {
+        reducer.reducePacket(rv1[2], rv1);
+        reducer.reducePacket(rv1[3], rv1 + 1);
+        reducer.reducePacket(rv1[1], rv1);
+        reducer.reducePacket(rv2[2], rv2);
+        reducer.reducePacket(rv2[3], rv2 + 1);
+        reducer.reducePacket(rv2[1], rv2);
+      }
+      half val1 = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &val1);
+      half val2 = __low2half(*rv2);
+      reducer.reduce(__high2half(*rv2), &val2);
+      val = __halves2half2(val1, val2);
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    return true;
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/128;
+    const int num_per_thread = /*128*/64;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+  }
+};
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                     typename Self::CoeffReturnType* output) {
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  // Do the reduction.
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  for (Index i = thread_id; i < max_iter; i += num_threads) {
+    const Index input_col = i % num_preserved_coeffs;
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);
+  }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static
+    #if !defined(EIGEN_HIPCC)
+    // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
+    //          (in the cxx11_tensor_reduction_gpu test)
+    //
+    // terminate called after throwing an instance of 'std::runtime_error'
+    //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+    //
+    // don't know why this happens (and why is it a runtime error instead of a compile time error)
+    //
+    // this will be fixed by HIP PR#457
+    EIGEN_DEVICE_FUNC
+    #endif
+    bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 16;
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() *
+                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs in the reduction kernel itself when we don't have to worry
+      // about race conditions between multiple thread blocks.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks = device.getNumGpuMultiProcessors() *
+                             device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 3daecb045..474eba06f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -11,232 +11,572 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*****************************************************************
- * TensorSyclPlaceHolderExpr.h
+ * TensorReductionSycl.h
  *
  * \brief:
- *  This is the specialisation of the placeholder expression based on the
- * operation type
+ *  This is the specialization of the reduction operation. Two phase reduction approach 
+ * is used since the GPU does not have Global Synchronization for global memory among 
+ * different work-group/thread block. To solve the problem, we need to create two kernels 
+ * to reduce the data, where the first kernel reduce the data locally and each local 
+ * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
+ * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. 
+ * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
  *
-*****************************************************************/
+ *****************************************************************/
 
 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-
 namespace Eigen {
+namespace TensorSycl {
 namespace internal {
 
-template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
-  do {
-          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
-            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
-                                    cl::sycl::range<1>{std::min(length, local)}};
-            /* Two accessors are used: one to the buffer that is being reduced,
-             * and a second to local memory, used to store intermediate data. */
-            auto aI =
-                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
-            auto aOut =
-                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
-            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
-                               cl::sycl::access::target::local>
-                scratch(cl::sycl::range<1>(local), h);
-
-            /* The parallel_for invocation chosen is the variant with an nd_item
-             * parameter, since the code requires barriers for correctness. */
-            h.parallel_for<KernelName>(
-                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
-                  size_t globalid = id.get_global(0);
-                  size_t localid = id.get_local(0);
-                  /* All threads collectively read from global memory into local.
-                   * The barrier ensures all threads' IO is resolved before
-                   * execution continues (strictly speaking, all threads within
-                   * a single work-group - there is no co-ordination between
-                   * work-groups, only work-items). */
-                  if (globalid < length) {
-                    scratch[localid] = aI[globalid];
-                  }
-                  id.barrier(cl::sycl::access::fence_space::local_space);
-
-                  /* Apply the reduction operation between the current local
-                   * id and the one on the other half of the vector. */
-                  if (globalid < length) {
-                    int min = (length < local) ? length : local;
-                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
-                      if (localid < offset) {
-                        scratch[localid] += scratch[localid + offset];
-                      }
-                      id.barrier(cl::sycl::access::fence_space::local_space);
-                    }
-                    /* The final result will be stored in local id 0. */
-                    if (localid == 0) {
-                      aI[id.get_group(0)] = scratch[localid];
-                      if((length<=local) && globalid ==0){
-                        aOut[globalid]=scratch[localid];
-                      }
-                    }
-                  }
-                });
-          };
-            dev.m_queue.submit(f);
-            dev.m_queue.throw_asynchronous();
-
-          /* At this point, you could queue::wait_and_throw() to ensure that
-           * errors are caught quickly. However, this would likely impact
-           * performance negatively. */
-          length = length / local;
-
-        } while (length > 1);
-
-
-
-}
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
 
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
 };
 
-/// For now let's start with a full reducer
-/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
-/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
-/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
-// a leafNode.
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    // Our empirical research shows that the best performance will be achieved
+    // when there is only one element per thread to reduce in the second step.
+    // in this step the second step reduction time is almost negligible.
+    // Hence, in the second step of reduction the input size is fixed to the
+    // local size, thus, there is only one element read per thread. The
+    // algorithm must be changed if the number of reduce per thread in the
+    // second step is greater than 1. Otherwise, the result will be wrong.
+    const Index localid = itemID.get_local_id(0);
+    auto aInPtr = aI.get_pointer() + localid;
+    auto aOutPtr = outAcc.get_pointer();
+    CoeffReturnType *scratchptr = scratch.get_pointer();
+    CoeffReturnType accumulator = *aInPtr;
+
+    scratchptr[localid] = op.finalize(accumulator);
+    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratchptr[localid + offset], &accumulator);
+        scratchptr[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) *aOutPtr = op.finalize(accumulator);
+  }
+};
+
+// Full reduction first phase. In this version the vectorization is true and the reduction accept 
+// any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ). 
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+      OpDef;
+
+  typedef typename OpDef::type Op;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::PacketReturnType PacketReturnType;
+  typedef
+      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+                                              PacketReturnType, CoeffReturnType>::type OutType;
+  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  LocalAccessor scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType final_output;
+  Index rng;
+  Op op;
+
+  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+                             Index rng_, OpType op_)
+      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    Index start = Evaluator::PacketSize * globalid;
+    // vectorizable parts
+    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+    for (Index i = start; i < VectorizedRange; i += step) {
+      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+    }
+    globalid += VectorizedRange;
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.template reducePacket<PacketReturnType>(
+          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+              evaluator.impl().coeff(i), op.initialize()),
+          &packetAccumulator);
+    }
+    scratch[localid] = packetAccumulator =
+        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+    // reduction parts // Local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] =
+          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+    }
+  }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) {
+    auto output_ptr = final_output.get_pointer();
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    // vectorizable parts
+    CoeffReturnType accumulator = op.initialize();
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.reduce(evaluator.impl().coeff(i), &accumulator);
+    }
+    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+    // reduction parts. the local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratch[localid + offset], &accumulator);
+        scratch[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  template <typename Scratch>
+  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+                       Index range_, Index num_values_to_reduce_)
+      : evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        functor(OpDef::get_op(functor_)),
+        range(range_),
+        num_values_to_reduce(num_values_to_reduce_) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) {
+    auto output_accessor_ptr = output_accessor.get_pointer();
+    /// const cast added as a naive solution to solve the qualifier drop error
+    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid < range) {
+      CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+          evaluator, evaluator.firstInput(globalid), functor, &accum);
+      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+    }
+  }
+
+ private:
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op functor;
+  Index range;
+  Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAcc;
+  ScratchAcc scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op op;
+  const Index preserve_elements_num_groups;
+  const Index reduce_elements_num_groups;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+      : scratch(scratch_),
+        evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        preserve_elements_num_groups(preserve_elements_num_groups_),
+        reduce_elements_num_groups(reduce_elements_num_groups_),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+                                                                 CoeffReturnType &accumulator) {
+    if (globalPId >= num_coeffs_to_preserve) {
+      return;
+    }
+    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
+    Index localOffset = globalRId;
+
+    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+    const Index per_thread_global_stride =
+        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+      localOffset += per_thread_local_stride;
+      global_offset += per_thread_global_stride;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index linearLocalThreadId = itemID.get_local_id(0);
+    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+                                                           : itemID.get_group(0) / reduce_elements_num_groups;
+    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+                                                           : itemID.get_group(0) % reduce_elements_num_groups;
+
+    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+    auto scratchPtr = scratch.get_pointer().get();
+    auto outPtr =
+        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+    CoeffReturnType accumulator = op.initialize();
+
+    element_wise_reduce(globalRId, globalPId, accumulator);
 
+    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+        accumulator;
+    if (rt == reduction_dim::inner_most) {
+      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    }
+
+    /* Apply the reduction operation between the current local
+     * id and the one on the other half of the vector. */
+    auto out_scratch_ptr =
+        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    if (rt == reduction_dim::inner_most) {
+      accumulator = *out_scratch_ptr;
+    }
+    // The Local LocalThreadSizeR is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+      if (rLocalThreadId < offset) {
+        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+        // The result has already been divided for mean reducer in the
+        // previous reduction so no need to divide furthermore
+        *out_scratch_ptr = op.finalize(accumulator);
+      }
+      /* All threads collectively read from global memory into local.
+       * The barrier ensures all threads' IO is resolved before
+       * execution continues (strictly speaking, all threads within
+       * a single work-group - there is no co-ordination between
+       * work-groups, only work-items). */
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+
+    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+      outPtr[globalPId] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAccessor;
+  InputAccessor input_accessor;
+  OutputAccessor output_accessor;
+  Op op;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+                                                                   OutputAccessor output_accessor_, OpType op_,
+                                                                   const Index num_coeffs_to_preserve_,
+                                                                   const Index num_coeffs_to_reduce_)
+      : input_accessor(input_accessor_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    const Index globalId = itemID.get_global_id(0);
+
+    if (globalId >= num_coeffs_to_preserve) return;
+
+    auto in_ptr = input_accessor.get_pointer() + globalId;
+
+    OutScalar accumulator = op.initialize();
+// num_coeffs_to_reduce is not bigger that 256
+    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+      op.reduce(*in_ptr, &accumulator);
+      in_ptr += num_coeffs_to_preserve;
+    }
+    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+  }
+};  // namespace internal
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+  static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
   typedef typename Self::CoeffReturnType CoeffReturnType;
-  static const bool HasOptimizedImplementation = false;
-
-  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
-    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-    auto functors = TensorSycl::internal::extractFunctors(self.impl());
-    int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
-    size_t inputSize =self.impl().dimensions().TotalSize();
-    size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
-    size_t remaining = inputSize% red_factor;
-    if(rng ==0) {
-      red_factor=1;
-    };
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-    size_t GRange=std::max((size_t )1, rng);
-
-    // convert global range to power of 2 for redecution
-    GRange--;
-    GRange |= GRange >> 1;
-    GRange |= GRange >> 2;
-    GRange |= GRange >> 4;
-    GRange |= GRange >> 8;
-    GRange |= GRange >> 16;
-#if __x86_64__ || __ppc64__ || _WIN64
-    GRange |= GRange >> 32;
-#endif
-    GRange++;
-    size_t  outTileSize = tileSize;
-    /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
-    if (GRange < outTileSize) outTileSize=GRange;
-    // getting final out buffer at the moment the created buffer is true because there is no need for assign
-    auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
-    /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
-    /// recursively apply reduction on it in order to reduce the whole.
-    auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
-
-      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-        /// the device_evaluator is detectable and recognisable on the device.
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-
-        if(globalid<rng)
-          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
-        else
-          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
-
-        if(remaining!=0 && globalid==0 )
-          // this will add the rest of input buffer when the input size is not devidable to red_factor.
-          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
-      });
-    });
-  dev.m_queue.throw_asynchronous();
-
-/// This is used to recursively reduce the tmp value to an element of 1;
-  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+  typedef typename Self::Storage Storage;
+  typedef typename Self::Index Index;
+  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+      PannelParameters;
+
+  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+    // getPowerOfTwo makes sure local range is power of 2 and <=
+    // maxSyclThreadPerBlock this will help us to avoid extra check on the
+    // kernel
+    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // In this step, we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 64
+    // elemnts individually, we get better performance. However, this can change
+    // on different platforms. In this step we force the code not to be
+    // morthan step reduction: Our empirical research shows that for inner_most
+    // dim reducer, it is better to have 8 group in a reduce dimension for sizes
+    // > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    EIGEN_CONSTEXPR Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+
+      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+          SecondStepPartialReductionKernel;
+
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+          temp_accessor, output,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+          reducer, num_coeffs_to_preserve, rNumGroups);
+
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+          num_coeffs_to_reduce);
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // In this step we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 512
+    // elemnts individually, we get better performance.
+    const Index reductionPerThread = 2048;
+    // const Index num_work_group =
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    // 1
+    // ? local_range
+    // : 1);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer);
+
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+          tmp_global_accessor, data,
+          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+          reducer);
+
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer);
+    }
+  }
+};
+// vectorizable inner_most most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
   }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
 
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
 };
 
+// ArmgMax uses this kernel for partial reduction//
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
 template <typename Self, typename Op>
-struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    typename Self::Index range, GRange, tileSize;
+    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
 
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  static const bool HasOptimizedImplementation = false;
-
-  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
-    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
-    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-    auto functors = TensorSycl::internal::extractFunctors(self.impl());
-
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-
-    size_t GRange=num_coeffs_to_preserve;
-    if (tileSize>GRange) tileSize=GRange;
-    else if(GRange>tileSize){
-      size_t xMode = GRange % tileSize;
-      if (xMode != 0) GRange += (tileSize - xMode);
-    }
-    // getting final out buffer at the moment the created buffer is true because there is no need for assign
-    /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
-    /// recursively apply reduction on it in order to reduce the whole.
-    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
-    Dims dims= self.xprDims();
-    Op functor = reducer;
-
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
-      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
-
-      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
-        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-        /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
-        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-        /// the device_evaluator is detectable and recognisable on the device.
-        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
-        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-        /// const cast added as a naive solution to solve the qualifier drop error
-        auto globalid=itemID.get_global_linear_id();
-        if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
-          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
-          GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
-          functor.finalize(accum);
-          output_accessor.get_pointer()[globalid]= accum;
-        }
-      });
-    });
-  dev.m_queue.throw_asynchronous();
+    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
     return false;
   }
 };
 
-}  // end namespace internal
+}  // namespace internal
 }  // namespace Eigen
 
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 99245f778..a27d3646d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -31,7 +31,7 @@ class TensorLazyBaseEvaluator {
   int refCount() const { return m_refcount; }
 
  private:
-  // No copy, no assigment;
+  // No copy, no assignment;
   TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
   TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
 
@@ -44,6 +44,9 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t
  public:
   //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
   typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef  TensorEvaluator<Expr, Device> EvalType;
 
   TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
     m_dims = m_impl.dimensions();
@@ -79,6 +82,8 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension
  public:
   typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
   typedef typename Base::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
   }
@@ -136,11 +141,17 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
     enum {
       IsAligned = false,
       PacketAccess = false,
+      BlockAccess = false,
+      PreferBlockAccess = false,
       Layout = PlainObjectType::Layout,
       CoordAccess = false,  // to be implemented
       RawAccess = false
     };
 
+    //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+    typedef internal::TensorBlockNotImplemented TensorBlock;
+    //===------------------------------------------------------------------===//
+
     EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
     }
 
@@ -360,26 +371,34 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
       : m_ref(m)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
+  EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     return m_ref.coeff(index);
@@ -389,7 +408,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
     return m_ref.coeffRef(index);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
 
  protected:
   TensorRef<Derived> m_ref;
@@ -411,10 +430,16 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
   enum {
     IsAligned = false,
     PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 14e392e36..586ce68ab 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -31,6 +31,7 @@ struct traits<TensorReverseOp<ReverseDimensions,
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename ReverseDimensions, typename XprType>
@@ -53,15 +54,16 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
                                           XprType>, WriteAccessors>
 {
   public:
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
-                                                                    StorageKind;
-  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+    typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+                                                                      StorageKind;
+    typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
       const XprType& expr, const ReverseDimensions& reverse_dims)
       : m_xpr(expr), m_reverse_dims(reverse_dims) { }
 
@@ -72,24 +74,8 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -107,19 +93,38 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess       = NumDims > 0,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_impl(op.expression(), device), m_reverse(op.reverse())
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+      ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_reverse(op.reverse()),
+        m_device(device)
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -130,11 +135,13 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
       m_strides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
@@ -142,11 +149,20 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -155,8 +171,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -169,8 +186,9 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         inputIndex += index;
       }
     } else {
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_strides[i];
+        Index idx = index / m_fastStrides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -202,6 +220,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     // local structure.
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
                                                             values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -209,6 +228,130 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    // Block evaluation reads underlying memory in reverse order, and default
+    // cost model does not properly catch this in bytes stored/loaded.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+               target_size)
+        .addCostPerCoeff({0, 0, 24});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool /*root_of_expr_ast*/ = false) const {
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor =
+        static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    // Offset in the output block.
+    Index block_offset = 0;
+
+    // Offset in the input Tensor.
+    Index input_offset = reverseIndex(desc.offset());
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride =
+          i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].size = it[effective_inner_dim].size * it[i].size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride ==
+                 (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].size;
+
+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -222,13 +365,42 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
            TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
 
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
+  const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0),
+          count(0),
+          reverse(false),
+          block_stride(0),
+          block_span(0),
+          input_stride(0),
+          input_span(0) {}
+
+    Index size;
+    Index count;
+    bool reverse;
+    Index block_stride;
+    Index block_span;
+    Index input_stride;
+    Index input_span;
+  };
 };
 
 // Eval as lvalue
@@ -247,18 +419,23 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
@@ -275,11 +452,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     // This code is pilfered from TensorMorphing.h
     EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
-
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 8501466ce..beae854dd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -24,6 +24,7 @@ struct traits<TensorScanOp<Op, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Op, typename XprType>
@@ -76,8 +77,299 @@ protected:
   const bool m_exclusive;
 };
 
-template <typename Self, typename Reducer, typename Device>
-struct ScanLauncher;
+
+namespace internal {
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  // Compute the scan along the axis, starting at the given offset
+  typename Self::CoeffReturnType accum = self.accumulator().initialize();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+}
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
+                                      typename Self::CoeffReturnType* data) {
+  using Scalar = typename Self::CoeffReturnType;
+  using Packet = typename Self::PacketReturnType;
+  // Compute the scan along the axis, starting at the calculated offset
+  Packet accum = self.accumulator().template initializePacket<Packet>();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  }
+}
+
+template <typename Self, bool Vectorize, bool Parallel>
+struct ReduceBlock {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Specialization for vectorized reduction.
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index idx2 = 0;
+    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
+      // Calculate the starting offset for the packet scan
+      Index offset = idx1 + idx2;
+      ReducePacket(self, offset, data);
+    }
+    for (; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Single-threaded CPU implementation of scan
+template <typename Self, typename Reducer, typename Device,
+          bool Vectorize =
+              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
+               internal::reducer_traits<Reducer, Device>::PacketAccess)>
+struct ScanLauncher {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    Index total_size = internal::array_prod(self.dimensions());
+
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and
+    // idx2.
+    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
+      block_reducer(self, idx1, data);
+    }
+  }
+};
+
+#ifdef EIGEN_USE_THREADS
+
+// Adjust block_size to avoid false sharing of cachelines among
+// threads. Currently set to twice the cache line size on Intel and ARM
+// processors.
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
+  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+  const Index items_per_cacheline =
+      numext::maxi<Index>(1, kBlockAlignment / item_size);
+  return items_per_cacheline * divup(block_size, items_per_cacheline);
+}
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index num_scalars = self.stride();
+    Index num_packets = 0;
+    if (self.stride() >= PacketSize) {
+      num_packets = self.stride() / PacketSize;
+      self.device().parallelFor(
+          num_packets,
+        TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
+                     16 * PacketSize * self.size(), true, PacketSize),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index packet = first; packet < last; ++packet) {
+            const Index idx2 = packet * PacketSize;
+            ReducePacket(self, idx1 + idx2, data);
+          }
+        });
+      num_scalars -= num_packets * PacketSize;
+    }
+    self.device().parallelFor(
+        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index scalar = first; scalar < last; ++scalar) {
+            const Index idx2 = num_packets * PacketSize + scalar;
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+                                      typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    self.device().parallelFor(
+        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) {
+          return AdjustBlockSize(sizeof(Scalar), blk_size);
+        },
+        [&](Index first, Index last) {
+          for (Index idx2 = first; idx2 < last; ++idx2) {
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+// Specialization for multi-threaded execution.
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index inner_block_size = self.stride() * self.size();
+    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
+
+    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
+        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
+      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
+      launcher(self, data);
+      return;
+    }
+
+    if (parallelize_by_outer_blocks) {
+      // Parallelize over outer blocks.
+      const Index num_outer_blocks = total_size / inner_block_size;
+      self.device().parallelFor(
+          num_outer_blocks,
+          TensorOpCost(inner_block_size, inner_block_size,
+                       16 * PacketSize * inner_block_size, Vectorize,
+                       PacketSize),
+          [=](Index blk_size) {
+            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
+          },
+          [&](Index first, Index last) {
+            for (Index idx1 = first; idx1 < last; ++idx1) {
+              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
+              block_reducer(self, idx1 * inner_block_size, data);
+            }
+          });
+    } else {
+      // Parallelize over inner packets/scalars dimensions when the reduction
+      // axis is not an inner dimension.
+      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
+      for (Index idx1 = 0; idx1 < total_size;
+           idx1 += self.stride() * self.size()) {
+        block_reducer(self, idx1, data);
+      }
+    }
+  }
+};
+#endif  // EIGEN_USE_THREADS
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+  // Compute offset as in the CPU version
+  Index val = threadIdx.x + blockIdx.x * blockDim.x;
+  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+  if (offset + (self.size() - 1) * self.stride() < total_size) {
+    // Compute the scan along the axis, starting at the calculated offset
+    typename Self::CoeffReturnType accum = self.accumulator().initialize();
+    for (Index idx = 0; idx < self.size(); idx++) {
+      Index curr = offset + idx * self.stride();
+      if (self.exclusive()) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      } else {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+  __syncthreads();
+
+}
+
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
+  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+     Index total_size = internal::array_prod(self.dimensions());
+     Index num_blocks = (total_size / self.size() + 63) / 64;
+     Index block_size = 64;
+
+     LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+  }
+};
+#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
+
+}  // namespace internal
 
 // Eval as rvalue
 template <typename Op, typename ArgType, typename Device>
@@ -85,30 +377,38 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 
   typedef TensorScanOp<Op, ArgType> XprType;
   typedef typename XprType::Index Index;
+  typedef const ArgType ChildTypeNoConst;
+  typedef const ArgType ChildType;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
   typedef DSizes<Index, NumDims> Dimensions;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
-    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
     BlockAccess = false,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = true
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device),
         m_device(device),
         m_exclusive(op.exclusive()),
         m_accumulator(op.accumulator()),
         m_size(m_impl.dimensions()[op.axis()]),
-        m_stride(1),
+        m_stride(1), m_consume_dim(op.axis()),
         m_output(NULL) {
 
     // Accumulating a scalar isn't supported.
@@ -122,7 +422,11 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
         m_stride = m_stride * dims[i];
       }
     } else {
-      for (int i = NumDims - 1; i > op.axis(); --i) {
+      // dims can only be indexed through unsigned integers,
+      // so let's use an unsigned type to let the compiler knows.
+      // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
+      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
+      for (unsigned int i = NumDims - 1; i > axis; --i) {
         m_stride = m_stride * dims[i];
       }
     }
@@ -136,6 +440,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_stride;
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+    return m_consume_dim;
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
     return m_size;
   }
@@ -156,16 +464,16 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return m_device;
   }
 
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    ScanLauncher<Self, Op, Device> launcher;
+    internal::ScanLauncher<Self, Op, Device> launcher;
     if (data) {
       launcher(*this, data);
       return false;
     }
 
     const Index total_size = internal::array_prod(dimensions());
-    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
     launcher(*this, m_output);
     return true;
   }
@@ -175,7 +483,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
   {
     return m_output;
   }
@@ -189,98 +497,31 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_output != NULL) {
-      m_device.deallocate(m_output);
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output) {
+      m_device.deallocate_temp(m_output);
       m_output = NULL;
     }
     m_impl.cleanup();
   }
 
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+    m_output.bind(cgh);
+  }
+#endif
 protected:
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device& m_device;
+  const Device EIGEN_DEVICE_REF m_device;
   const bool m_exclusive;
   Op m_accumulator;
   const Index m_size;
   Index m_stride;
-  CoeffReturnType* m_output;
-};
-
-// CPU implementation of scan
-// TODO(ibab) This single-threaded implementation should be parallelized,
-// at least by running multiple scans at the same time.
-template <typename Self, typename Reducer, typename Device>
-struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType *data) {
-    Index total_size = internal::array_prod(self.dimensions());
-
-    // We fix the index along the scan axis to 0 and perform a
-    // scan per remaining entry. The iteration is split into two nested
-    // loops to avoid an integer division by keeping track of each idx1 and idx2.
-    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
-      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
-        // Calculate the starting offset for the scan
-        Index offset = idx1 + idx2;
-
-        // Compute the scan along the axis, starting at the calculated offset
-        typename Self::CoeffReturnType accum = self.accumulator().initialize();
-        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-          Index curr = offset + idx3 * self.stride();
-
-          if (self.exclusive()) {
-            data[curr] = self.accumulator().finalize(accum);
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-          } else {
-            self.accumulator().reduce(self.inner().coeff(curr), &accum);
-            data[curr] = self.accumulator().finalize(accum);
-          }
-        }
-      }
-    }
-  }
-};
-
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-
-// GPU implementation of scan
-// TODO(ibab) This placeholder implementation performs multiple scans in
-// parallel, but it would be better to use a parallel scan algorithm and
-// optimize memory access.
-template <typename Self, typename Reducer>
-__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
-  // Compute offset as in the CPU version
-  Index val = threadIdx.x + blockIdx.x * blockDim.x;
-  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
-
-  if (offset + (self.size() - 1) * self.stride() < total_size) {
-    // Compute the scan along the axis, starting at the calculated offset
-    typename Self::CoeffReturnType accum = self.accumulator().initialize();
-    for (Index idx = 0; idx < self.size(); idx++) {
-      Index curr = offset + idx * self.stride();
-      if (self.exclusive()) {
-        data[curr] = self.accumulator().finalize(accum);
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-      } else {
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-        data[curr] = self.accumulator().finalize(accum);
-      }
-    }
-  }
-  __syncthreads();
-
-}
-
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice> {
-  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
-     Index total_size = internal::array_prod(self.dimensions());
-     Index num_blocks = (total_size / self.size() + 63) / 64;
-     Index block_size = 64;
-     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
-  }
+  Index m_consume_dim;
+  EvaluatorPointerType m_output;
 };
-#endif  // EIGEN_USE_GPU && __CUDACC__
 
 }  // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 000000000..7f68ecb6a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,513 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ *  Tensor Scan Sycl implement the extend  version of
+ * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
+ * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
+ * the size of the tensor. In the first kernel (ScanKernelFunctor), each
+ * threads within the work-group individually reduces the allocated elements per
+ * thread in order to reduces the total number of blocks. In the next step all
+ * thread within the work-group will reduce the associated blocks into the
+ * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
+ * buffer is given as an input and all the threads within a work-group scan and
+ * reduces the boundaries between the blocks (generated from the previous
+ * kernel). and write the data on the temporary buffer. If the second kernel is
+ * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
+ * adjust the final result into the output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+  // must be power of 2
+  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+  const index_t total_size;
+  const index_t non_scan_size;
+  const index_t scan_size;
+  const index_t non_scan_stride;
+  const index_t scan_stride;
+  const index_t panel_threads;
+  const index_t group_threads;
+  const index_t block_threads;
+  const index_t elements_per_group;
+  const index_t elements_per_block;
+  const index_t loop_range;
+
+  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+      : total_size(total_size_),
+        non_scan_size(non_scan_size_),
+        scan_size(scan_size_),
+        non_scan_stride(non_scan_stride_),
+        scan_stride(scan_stride_),
+        panel_threads(panel_threads_),
+        group_threads(group_threads_),
+        block_threads(block_threads_),
+        elements_per_group(elements_per_group_),
+        elements_per_block(elements_per_block_),
+        loop_range(loop_range_) {}
+};
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+          scan_step stp>
+struct ScanKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+  LocalAccessor scratch;
+  Evaluator dev_eval;
+  OutAccessor out_accessor;
+  OutAccessor temp_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  const bool inclusive;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
+                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
+                                                          const bool inclusive_)
+      : scratch(scratch_),
+        dev_eval(dev_eval_),
+        out_accessor(out_accessor_),
+        temp_accessor(temp_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_),
+        inclusive(inclusive_) {}
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt.coeff(global_id);
+  }
+
+  template <scan_step sst = stp, typename Input>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+      EIGEN_STRONG_INLINE
+      read(const Input &inpt, Index global_id) {
+    return inpt[global_id];
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+    inclusive_op();
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  first_step_inclusive_Operation(InclusiveOp) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto out_ptr = out_accessor.get_pointer();
+    auto tmp_ptr = temp_accessor.get_pointer();
+    auto scratch_ptr = scratch.get_pointer().get();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+      // we put one element per packet in scratch_mem
+      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+      CoeffReturnType inclusive_scan;
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+                           (global_id < scanParameters.total_size))
+                              ? read(dev_eval, global_id)
+                              : accumulator.initialize();
+        next_elements += scanParameters.scan_stride;
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+        }
+      });
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        Index private_offset = 1;
+        // build sum in place up the tree
+        EIGEN_UNROLL_LOOP
+        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+          private_offset *= 2;
+        }
+        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+            private_scan[PacketSize - 1 + packetIndex];
+        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+        // traverse down tree & build scan
+        EIGEN_UNROLL_LOOP
+        for (Index d = 1; d < PacketSize; d *= 2) {
+          private_offset >>= 1;
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[ai] = private_scan[bi];
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+        }
+      }
+
+      Index offset = 1;
+      // build sum in place up the tree
+      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch_ptr[ai], &accum);
+          accumulator.reduce(scratch_ptr[bi], &accum);
+          scratch_ptr[ai] = scratch_ptr[bi];
+          scratch_ptr[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // right the first set of private param
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_accessor;
+  OutAccessor out_accessor;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_accessor(in_accessor_),
+        out_accessor(out_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+    auto in_ptr = in_accessor.get_pointer();
+    auto out_ptr = out_accessor.get_pointer();
+
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+      CoeffReturnType adjust_val = in_ptr[in_id];
+
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          CoeffReturnType accum = adjust_val;
+          accumulator.reduce(out_ptr[global_id], &accum);
+          out_ptr[global_id] = accumulator.finalize(accum);
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }
+  }
+};
+
+template <typename Index>
+struct ScanInfo {
+  const Index &total_size;
+  const Index &scan_size;
+  const Index &panel_size;
+  const Index &non_scan_size;
+  const Index &scan_stride;
+  const Index &non_scan_stride;
+
+  Index max_elements_per_block;
+  Index block_size;
+  Index panel_threads;
+  Index group_threads;
+  Index block_threads;
+  Index elements_per_group;
+  Index elements_per_block;
+  Index loop_range;
+  Index global_range;
+  Index local_range;
+  const Eigen::SyclDevice &dev;
+  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+                               const Eigen::SyclDevice &dev_)
+      : total_size(total_size_),
+        scan_size(scan_size_),
+        panel_size(panel_size_),
+        non_scan_size(non_scan_size_),
+        scan_stride(scan_stride_),
+        non_scan_stride(non_scan_stride_),
+        dev(dev_) {
+    // must be power of 2
+    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+    elements_per_group =
+        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+    const Index elements_per_panel = elements_per_group * non_scan_size;
+    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+    block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+    const Index max_threads = panel_threads * panel_size;
+#endif
+    global_range = roundUp(max_threads, local_range);
+    loop_range = Index(
+        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+  }
+  inline ScanParameters<Index> get_scan_parameter() {
+    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+  }
+  inline cl::sycl::nd_range<1> get_thread_range() {
+    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+  }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+                                                           Reducer &accumulator, const Index total_size,
+                                                           const Index scan_size, const Index panel_size,
+                                                           const Index non_scan_size, const Index scan_stride,
+                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+        AdjustFuctor;
+    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+                                                                      scan_info.max_elements_per_block,
+                                                                      scan_info.get_scan_parameter(), accumulator);
+  }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+                                             const Index total_size, const Index scan_size, const Index panel_size,
+                                             const Index non_scan_size, const Index scan_stride,
+                                             const Index non_scan_stride, const bool inclusive,
+                                             const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+    CoeffReturnType *temp_pointer =
+        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+        scan_info.get_scan_parameter(), accumulator, inclusive);
+
+    if (scan_info.block_size > 1) {
+      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+          non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+          non_scan_stride, dev);
+    }
+    dev.deallocate_temp(temp_pointer);
+  }
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+namespace internal {
+template <typename Self, typename Reducer, bool vectorize>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
+  typedef typename Self::Index Index;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  void operator()(Self &self, EvaluatorPointerType data) {
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index scan_size = self.size();
+    const Index scan_stride = self.stride();
+    // this is the scan op (can be sum or ...)
+    auto accumulator = self.accumulator();
+    auto inclusive = !self.exclusive();
+    auto consume_dim = self.consume_dim();
+    auto dev = self.device();
+
+    auto dims = self.inner().dimensions();
+
+    Index non_scan_size = 1;
+    Index panel_size = 1;
+    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < consume_dim; i++) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+        panel_size *= dims[i];
+      }
+    } else {
+      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim - 1; i >= 0; i--) {
+        panel_size *= dims[i];
+      }
+    }
+    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+    auto eval_impl = self.inner();
+    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+        inclusive, dev);
+  }
+};
+} // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 113c060e3..e5e5efdee 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -31,6 +31,7 @@ struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Shuffle, typename XprType>
@@ -53,15 +54,16 @@ template<typename Shuffle, typename XprType>
 class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
-      : m_xpr(expr), m_shuffle(shuffle) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+      : m_xpr(expr), m_shuffle(shfl) {}
 
     EIGEN_DEVICE_FUNC
     const Shuffle& shufflePermutation() const { return m_shuffle; }
@@ -70,24 +72,8 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
 
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
 
   protected:
     typename XprType::Nested m_xpr;
@@ -99,6 +85,7 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -106,100 +93,246 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess       = false,  // to be implemented
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+                                                     Layout, Index>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device),
+        m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
+      m_shuffle[i] = static_cast<int>(shuffle[i]);
       m_dimensions[i] = input_dims[shuffle[i]];
+      m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
     }
 
-    array<Index, NumDims> inputStrides;
-
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      inputStrides[0] = 1;
+      m_unshuffledInputStrides[0] = 1;
       m_outputStrides[0] = 1;
+
       for (int i = 1; i < NumDims; ++i) {
-        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     } else {
-      inputStrides[NumDims - 1] = 1;
+      m_unshuffledInputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        m_unshuffledInputStrides[i] =
+            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
       }
     }
 
     for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] = inputStrides[shuffle[i]];
+      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+      EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return m_impl.coeff(srcCoeff(index));
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
   }
 
+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template<int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+        eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+  }
 
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = coeff(index+i);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const int inner_dim =
+        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+    const size_t target_size = m_device.firstLevelCacheSize();
+    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
+
+    // Shuffled inner dimensions leads to a random memory access, which is not
+    // captured by default cost model bytes loaded/stored. We add this cost
+    // explicitly. The number of cycles picked based on the benchmarks.
+    // TODO(ezhulenev): This number was picked based on a very questionable
+    // benchmarks, add benchmarks that are representative of real workloads.
+    using BlockRequirements = internal::TensorBlockResourceRequirements;
+    if (inner_dim_shuffled) {
+      return BlockRequirements::uniform<Scalar>(target_size)
+          .addCostPerCoeff({0, 0, NumDims * 28});
+    } else {
+      return BlockRequirements::skewed<Scalar>(target_size);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+          bool root_of_expr_ast = false) const {
+    assert(m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(
+            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
+
+    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+                         block_storage.data());
+
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    return block_storage.AsTensorMaterializedBlock();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+   // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
+      Index input_index,
+      const DSizes<Index, NumDims>& input_block_strides,
+      const DSizes<Index, NumDims>& output_block_strides,
+      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
+    Index output_index = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[0]];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index *
+          output_block_strides[m_inverseShuffle[NumDims - 1]];
+    }
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
@@ -208,8 +341,15 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   }
 
   Dimensions m_dimensions;
+  bool m_is_identity;
+  array<int, NumDims> m_shuffle;
+  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_unshuffledInputStrides;
+
+  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
@@ -228,15 +368,24 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned = false,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-    RawAccess = false
+    IsAligned         = false,
+    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    Layout            = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess         = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }
 
@@ -252,10 +401,68 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+      const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+        TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const Scalar* block_buffer = block.data();
+
+    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+    // expression with coefficient and packet access as `src`.
+    void* mem = NULL;
+    if (block_buffer == NULL) {
+      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+      typedef internal::TensorBlockAssignment<
+          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(
+              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+              buf),
+          block.expr());
+
+      block_buffer = buf;
+    }
+
+    // Read from block.
+    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+                         block_buffer);
+
+    // Write to the output buffer.
+    typename TensorBlockIO::Dimensions output_strides(
+        this->m_unshuffledInputStrides);
+    typename TensorBlockIO::Dimensions output_dimensions;
+    for (int i = 0; i < NumDims; ++i) {
+      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+    }
+    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+                         this->srcCoeff(desc.offset()));
+
+    // Reorder dimensions according to the shuffle.
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+    }
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    // Deallocate temporary buffer used for the block materialization.
+    if (mem != NULL) this->m_device.deallocate(mem);
+  }
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index 2854a4a17..5ff0880e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
   *
   * \sa Tensor
   */
-template<typename T, typename Dimensions, int Options_> class TensorStorage;
+template<typename T, typename Dimensions, int Options> class TensorStorage;
 
 
 // Pure fixed-size storage
-template<typename T, int Options_, typename FixedDimensions>
-class TensorStorage<T, FixedDimensions, Options_>
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
 {
  private:
   static const std::size_t Size = FixedDimensions::total_size;
@@ -45,8 +45,6 @@ class TensorStorage<T, FixedDimensions, Options_>
   static const std::size_t MinSize = max_n_1<Size>::size;
   EIGEN_ALIGN_MAX T m_data[MinSize];
 
-  FixedDimensions m_dimensions;
-
  public:
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE TensorStorage() {
@@ -57,16 +55,19 @@ class TensorStorage<T, FixedDimensions, Options_>
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE const T *data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
+  static EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
+  {
+    static const FixedDimensions* singleton_dimensions = new FixedDimensions();
+    return *singleton_dimensions;
+  }
 
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
+  EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
 };
 
-
 // pure dynamic
-template<typename T, int Options_, typename IndexType, int NumIndices_>
+template<typename T, typename IndexType, int NumIndices_, int Options_>
 class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
 {
   public:
@@ -107,6 +108,20 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
       return *this;
     }
 
+#if EIGEN_HAS_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
+    {
+      *this = std::move(other);
+    }
+    
+    EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
+    {
+      numext::swap(m_data, other.m_data);
+      numext::swap(m_dimensions, other.m_dimensions);
+      return *this;
+    }
+#endif
+
     EIGEN_DEVICE_FUNC  ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
     EIGEN_DEVICE_FUNC  void swap(Self& other)
     { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 6c35bfdb6..2f62a668f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -31,12 +31,13 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
 struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingOp<Strides, XprType>& type;
+  typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
 };
 
 template<typename Strides, typename XprType>
@@ -53,14 +54,15 @@ template<typename Strides, typename XprType>
 class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
 {
   public:
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+    typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
       : m_xpr(expr), m_dims(dims) {}
 
     EIGEN_DEVICE_FUNC
@@ -70,24 +72,7 @@ class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
-    {
-      typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
-      Assign assign(*this, other);
-      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
-      return *this;
-    }
+    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
 
   protected:
     typename XprType::Nested m_xpr;
@@ -106,22 +91,30 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
-      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+      m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -146,13 +139,14 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
   }
 
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -170,6 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -181,6 +176,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * m_inputStrides[0];
       inputIndices[1] += indices[1] * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -200,6 +196,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -222,13 +219,20 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
         TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
 
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -236,6 +240,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       }
       inputIndex += index * m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -252,7 +257,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
-
 // Eval as lvalue
 template<typename Strides, typename ArgType, typename Device>
 struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -267,19 +271,20 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device) { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {
@@ -295,6 +300,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -306,6 +312,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * this->m_inputStrides[0];
       inputIndices[1] += indices[1] * this->m_inputStrides[0];
     } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -325,6 +332,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       internal::pstore<Scalar, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
deleted file mode 100644
index bb8800d45..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
-
-#ifdef EIGEN_USE_SYCL
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeGlobalPointer {
-  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
-};
-
-// global pointer to set different attribute state for a class
-template <class T>
-struct MakeLocalPointer {
-  typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
-};
-
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
-  struct NoOP;
-
-template<bool IsConst, typename T> struct GetType{
-  typedef const T Type;
-};
-template<typename T> struct GetType<false, T>{
-  typedef T Type;
-};
-
-}
-}
-}
-
-// tuple construction
-#include "TensorSyclTuple.h"
-
-// counting number of leaf at compile time
-#include "TensorSyclLeafCount.h"
-
-// The index PlaceHolder takes the actual expression and replaces the actual
-// data on it with the place holder. It uses the same pre-order expression tree
-// traverse as the leaf count in order to give the right access number to each
-// node in the expression
-#include "TensorSyclPlaceHolderExpr.h"
-
-// creation of an accessor tuple from a tuple of SYCL buffers
-#include "TensorSyclExtractAccessor.h"
-
-// this is used to change the address space type in tensor map for GPU
-#include "TensorSyclConvertToDeviceExpression.h"
-
-// this is used to extract the functors
-#include "TensorSyclExtractFunctors.h"
-
-// this is used to create tensormap on the device
-// this is used to construct the expression on the device
-#include "TensorSyclExprConstructor.h"
-
-/// this is used for extracting tensor reduction
-#include "TensorReductionSycl.h"
-
-// kernel execution using fusion
-#include "TensorSyclRun.h"
-
-#endif  // end of EIGEN_USE_SYCL
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
deleted file mode 100644
index 8729c86ee..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ /dev/null
@@ -1,121 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- *  Conversion from host pointer to device pointer
- *  inside leaf nodes of the expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct ConvertToDeviceExpression
-/// \brief This struct is used to convert the MakePointer in the host expression
-/// to the MakeGlobalPointer for the device expression. For the leafNodes
-/// containing the pointer. This is due to the fact that the address space of
-/// the pointer T* is different on the host and the device.
-template <typename Expr>
-struct ConvertToDeviceExpression;
-
-template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
-struct NonOpConversion{
-  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
-};
-
-
-template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
-struct DeviceConvertor{
-  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
-};
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorMap
-#define TENSORMAPCONVERT(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
-  typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
-};
-
-TENSORMAPCONVERT(const)
-TENSORMAPCONVERT()
-#undef TENSORMAPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
-#define CATEGORYCONVERT(CVQual)\
-template <template<class, class...> class Category, typename OP, typename... subExprs>\
-struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
-  typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
-};
-CATEGORYCONVERT(const)
-CATEGORYCONVERT()
-#undef CATEGORYCONVERT
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is  TensorCwiseSelectOp
-#define SELECTOPCONVERT(CVQual, Res)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
-: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
-SELECTOPCONVERT(const, true)
-SELECTOPCONVERT(, false)
-#undef SELECTOPCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is const AssingOP
-#define ASSIGNCONVERT(CVQual, Res)\
-template <typename LHSExpr, typename RHSExpr>\
-struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
-: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
-
-ASSIGNCONVERT(const, true)
-ASSIGNCONVERT(, false)
-#undef ASSIGNCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node
-/// type is either TensorForcedEvalOp or TensorEvalToOp
-#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
-: DeviceConvertor<ExprNode, Res, Expr>{};
-
-KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
-KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
-KERNELBROKERCONVERT(const, true, TensorEvalToOp)
-KERNELBROKERCONVERT(, false, TensorEvalToOp)
-#undef KERNELBROKERCONVERT
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
-#define KERNELBROKERCONVERTREDUCTION(CVQual)\
-template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
-  typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
-};
-
-KERNELBROKERCONVERTREDUCTION(const)
-KERNELBROKERCONVERTREDUCTION()
-#undef KERNELBROKERCONVERTREDUCTION
-
-}  // namespace internal
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX1
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
deleted file mode 100644
index 7ed3a3a56..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ /dev/null
@@ -1,239 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExprConstructor.h
- *
- * \brief:
- *  This file re-create an expression on the SYCL device in order
- *  to use the original tensor evaluator.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// this class is used by EvalToOp in order to create an lhs expression which is
-/// a pointer from an accessor on device-only buffer
-template <typename PtrType, size_t N, typename... Params>
-struct EvalToLHSConstructor {
-  PtrType expr;
-  EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
-};
-
-/// \struct ExprConstructor is used to reconstruct the expression on the device and
-/// recreate the expression with MakeGlobalPointer containing the device address
-/// space for the TensorMap pointers used in eval function.
-/// It receives the original expression type, the functor of the node, the tuple
-/// of accessors, and the device expression type to re-instantiate the
-/// expression tree for the device
-template <typename OrigExpr, typename IndexExpr, typename... Params>
-struct ExprConstructor;
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAP(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
-  typedef  CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>  Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-TENSORMAP(const)
-TENSORMAP()
-#undef TENSORMAP
-
-#define UNARYCATEGORY(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
-  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
-  my_type rhsExpr;\
-  typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
-};
-
-UNARYCATEGORY(const)
-UNARYCATEGORY()
-#undef UNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorBinaryOp
-#define BINARYCATEGORY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
-typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>,  CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
-  typedef  ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
-  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
-  typedef  CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
-  my_left_type lhsExpr;\
-  my_right_type rhsExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
-};
-
-BINARYCATEGORY(const)
-BINARYCATEGORY()
-#undef BINARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseTernaryOp
-#define TERNARYCATEGORY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
-typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
-struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
-  typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
-  typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
-  typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
-  typedef  CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
-  my_arg1_type arg1Expr;\
-  my_arg2_type arg2Expr;\
-  my_arg3_type arg3Expr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
-  : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
-};
-
-TERNARYCATEGORY(const)
-TERNARYCATEGORY()
-#undef TERNARYCATEGORY
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorCwiseSelectOp
-#define SELECTOP(CVQual)\
-template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
-struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
-  typedef  ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
-  typedef  ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
-  typedef  ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
-  typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
-  my_if_type ifExpr;\
-  my_then_type thenExpr;\
-  my_else_type elseExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
-};
-
-SELECTOP(const)
-SELECTOP()
-#undef SELECTOP
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// const TensorAssignOp
-#define ASSIGN(CVQual)\
-template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
-struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>,  CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
-  typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
-  typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
-  typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type>  Type;\
-  my_left_type lhsExpr;\
-  my_right_type rhsExpr;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
- };
-
- ASSIGN(const)
- ASSIGN()
- #undef ASSIGN
-/// specialisation of the \ref ExprConstructor struct when the node type is
-///  TensorEvalToOp
-#define EVALTO(CVQual)\
-template <typename OrigExpr, typename Expr, typename... Params>\
-struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
-  typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
-  typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
-  typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
-  my_expr_type nestedExpression;\
-  EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
-  : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
-};
-
-EVALTO(const)
-EVALTO()
-#undef EVALTO
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
-  typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
-  TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-template <bool Conds,  size_t X , size_t Y > struct ValueCondition {
-  static const size_t Res =X;
-};
-template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {
-  static const size_t Res =Y;
-};
-
-/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXPR(CVQual)\
-template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
-  static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0,  1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
-  typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
-  NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
-  Type expr;\
-  template <typename FuncDetector>\
-  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
-  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
-};
-
-SYCLREDUCTIONEXPR(const)
-SYCLREDUCTIONEXPR()
-#undef SYCLREDUCTIONEXPR
-
-/// template deduction for \ref ExprConstructor struct
-template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
-auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
-    -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
-  return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
deleted file mode 100644
index b1da6858e..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclExtractAccessor.h
- *
- * \brief:
- * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl
- * buffers as an input. Using pre-order tree traversal, ExtractAccessor
- * recursively calls itself for its children in the expression tree. The
- * leaf node in the PlaceHolder expression is nothing but a container preserving
- * the order of the actual data in the tuple of sycl buffer. By invoking the
- * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
- * buffer in the tuple of buffers. This accessor is then added as an Nth
- * element in the tuple of accessors. In this case we preserve the order of data
- * in the expression tree.
- *
- * This is the specialisation of extract accessor method for different operation
- * type in the PlaceHolder expression.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
-/// accessor from a buffer.
-/// Depending on the type of the leaf node we can get a read accessor or a
-/// read_write accessor
-template <typename Evaluator>
-struct ExtractAccessor;
-
-struct AccessorConstructor{
-  template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
-  -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
-  return ExtractAccessor<Arg>::getTuple(cgh, eval);
-  }
-
-  template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
-  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
-    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
-  }
-  template<typename Arg1, typename Arg2, typename Arg3>	static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
-  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
-    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
-  }
-  template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
-  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
-  typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
-    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
-    return AccessorConstructor::getTuple(cgh, eval.impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp,  TensorCwiseUnaryOp and  TensorBroadcastingOp
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
-  }
-};
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseTernaryOp
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
-    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorCwiseSelectOp. This is a special case where there is no OP
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseSelectOp. This is a special case where there is no OP
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
-  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
-    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
- }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
-#define TENSORMAPEXPR(CVQual, ACCType)\
-template <typename PlainObjectType, int Options_, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
-  -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
-    return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
-  }\
-};
-TENSORMAPEXPR(const, cl::sycl::access::mode::read)
-TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
-  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
-    return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-template <typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
-template <typename OP, typename Dim, typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
-  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
-    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
-  }
-};
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-template <typename OP, typename Dim, typename Expr, typename Dev>
-struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
-: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
-
-/// template deduction for \ref ExtractAccessor
-template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
-  return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
-}
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
deleted file mode 100644
index 427125343..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclextractFunctors.h
- *
- * \brief:
- *  Used to extract all the functors allocated to each node of the expression
-*tree.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \struct FunctorExtractor:  This struct is used to extract the functors
-/// constructed on
-/// the host-side, to pack them and reuse them in reconstruction of the
-/// expression on the device.
-/// We have to do that as in Eigen the functors are not stateless so we cannot
-/// re-instantiate them on the device.
-/// We have to pass instantiated functors to the device.
-// This struct is used for leafNode (TensorMap) and nodes behaving like leafNode (TensorForcedEval).
-template <typename Evaluator> struct FunctorExtractor{
-  typedef typename Evaluator::Dimensions Dimensions;
-  const Dimensions m_dimensions;
-  const Dimensions& dimensions() const { return m_dimensions; }
-  FunctorExtractor(const Evaluator& expr)
-  : m_dimensions(expr.dimensions()) {}
-
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
-  : rhsExpr(expr.impl()), func(expr.functor()) {}
-};
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseBinaryOp
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
-  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseBinaryOp
-template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseTernaryOp
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
-  FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
-  FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
-  OP func;
-  FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
-  : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseTernaryOp
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
-struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
-:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
-  FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
-  FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
-  : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
-:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
-  : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorAssignOp. This is an specialisation without OP so it has to be separated.
-template <typename LHSExpr, typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
-:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-template <typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
-  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
-  FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
-  : rhsExpr(expr.impl()) {}
-};
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
-template <typename RHSExpr, typename Dev>
-struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
-: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
-
-template<typename Dim, size_t NumOutputDim> struct DimConstr {
-template<typename InDim>
-  static inline Dim getDim(InDim dims ) {return dims;}
-};
-
-template<typename Dim> struct DimConstr<Dim, 0> {
-  template<typename InDim>
-    static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
-};
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
-  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
-  typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
-  const Dimensions m_dimensions;
-  const Dimensions& dimensions() const { return m_dimensions; }
-  FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
-  : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
-};
-
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
-: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
-/// template deduction function for FunctorExtractor
-template <typename Evaluator>
-auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
-  return FunctorExtractor<Evaluator>(evaluator);
-}
-}  // namespace internal
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
deleted file mode 100644
index 25d1fac9b..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ /dev/null
@@ -1,114 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclLeafCount.h
- *
- * \brief:
- *  The leaf count used the pre-order expression tree traverse in order to name
- *  count the number of leaf nodes in the expression
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-/// \brief LeafCount used to counting terminal nodes. The total number of
-/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
-/// of the leaf node in a expression tree at compile time.
-template <typename Expr>
-struct LeafCount;
-
-template<typename... Args> struct CategoryCount;
-
-template<> struct CategoryCount<>
-{
-  static const size_t Count =0;
-};
-
-template<typename Arg, typename... Args>
-struct CategoryCount<Arg,Args...>{
-  static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
-struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
-  static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorMap
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
-struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
-
-// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and Const TensorBroadcastingOp
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
-struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-// TensorCwiseUnaryOp,  TensorCwiseNullaryOp,  TensorCwiseBinaryOp,  TensorCwiseTernaryOp, and  TensorBroadcastingOp
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
-struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>
-struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>
-struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
-
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
-template <typename LHSExpr, typename RHSExpr>
-struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-
-/// specialisation of the \ref LeafCount struct when the node type is
-/// TensorAssignOp is an exception. It is not the same as Unary
-template <typename LHSExpr, typename RHSExpr>
-struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-template <typename Expr>
-struct LeafCount<const TensorForcedEvalOp<Expr> > {
-    static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
-template <typename Expr>
-struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
-template <typename Expr>
-struct LeafCount<const TensorEvalToOp<Expr> > {
-  static const size_t Count = 1 + CategoryCount<Expr>::Count;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-template <typename OP, typename Dim, typename Expr>
-struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
-    static const size_t Count =1;
-};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
-template <typename OP, typename Dim, typename Expr>
-struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-template <typename Expr>
-struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
-
-} /// namespace TensorSycl
-} /// namespace internal
-} /// namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
deleted file mode 100644
index d4c250c6d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ /dev/null
@@ -1,181 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclPlaceHolderExpr.h
- *
- * \brief:
- *  This is the specialisation of the placeholder expression based on the
- * operation type
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-/// \struct PlaceHolder
-/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
-/// tree.
-/// PlaceHolder contains the order of the leaf node in the expression tree.
-template <typename Scalar, size_t N>
-struct PlaceHolder {
-  static constexpr size_t I = N;
-  typedef Scalar Type;
-};
-
-/// \sttruct PlaceHolderExpression
-/// \brief it is used to create the PlaceHolder expression. The PlaceHolder
-/// expression is a copy of expression type in which the TensorMap of the has
-/// been replaced with PlaceHolder.
-template <typename Expr, size_t N>
-struct PlaceHolderExpression;
-
-template<size_t N, typename... Args>
-struct CalculateIndex;
-
-template<size_t N, typename Arg>
-struct CalculateIndex<N, Arg>{
-  typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
-  typedef utility::tuple::Tuple<ArgType> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2>
-struct CalculateIndex<N, Arg1, Arg2>{
-  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
-  typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;
-  typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
-  typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
-};
-
-template<size_t N, typename Arg1, typename Arg2, typename Arg3>
-struct CalculateIndex<N, Arg1, Arg2, Arg3> {
-  static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
-  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
-  typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;
-  typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;
-  typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
-  typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
-};
-
-template<template<class...> class Category , class OP, class TPL>
-struct CategoryHelper;
-
-template<template<class...> class Category , class OP, class ...T >
-struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
-  typedef Category<OP, T... > Type;
-};
-
-template<template<class...> class Category , class ...T >
-struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
-  typedef Category<T... > Type;
-};
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp,  TensorCwiseTernaryOp
-#define OPEXPRCATEGORY(CVQual)\
-template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
-struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
-  typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
-};
-
-OPEXPRCATEGORY(const)
-OPEXPRCATEGORY()
-#undef OPEXPRCATEGORY
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseSelectOp
-#define SELECTEXPR(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
-  typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
-};
-
-SELECTEXPR(const)
-SELECTEXPR()
-#undef SELECTEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorAssignOp
-#define ASSIGNEXPR(CVQual)\
-template <typename LHSExpr, typename RHSExpr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
-  typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
-};
-
-ASSIGNEXPR(const)
-ASSIGNEXPR()
-#undef ASSIGNEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorMap
-#define TENSORMAPEXPR(CVQual)\
-template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
-  typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
-};
-
-TENSORMAPEXPR(const)
-TENSORMAPEXPR()
-#undef TENSORMAPEXPR
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorForcedEvalOp
-#define FORCEDEVAL(CVQual)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
-  typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
-};
-
-FORCEDEVAL(const)
-FORCEDEVAL()
-#undef FORCEDEVAL
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorEvalToOp
-#define EVALTO(CVQual)\
-template <typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
-  typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-EVALTO(const)
-EVALTO()
-#undef EVALTO
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLREDUCTION(CVQual)\
-template <typename OP, typename Dims, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
-  typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
-};
-SYCLREDUCTION(const)
-SYCLREDUCTION()
-#undef SYCLREDUCTION
-
-/// template deduction for \ref PlaceHolderExpression struct
-template <typename Expr>
-struct createPlaceHolderExpression {
-  static const size_t TotalLeaves = LeafCount<Expr>::Count;
-  typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;
-};
-
-}  // internal
-}  // TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
deleted file mode 100644
index 7914b6fad..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Cummins Chris PhD student at The University of Edinburgh.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclRun.h
- *
- * \brief:
- * Schedule_kernel invoke an specialised version of kernel struct. The
- * specialisation is based on the data dimension in sycl buffer
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-/// The run function in tensor sycl convert the expression tree to a buffer
-/// based expression tree;
-/// creates the expression tree for the device with accessor to buffers;
-/// construct the kernel and submit it to the sycl queue.
-template <typename Expr, typename Dev>
-void run(Expr &expr, Dev &dev) {
-  Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-  if (needs_assign) {
-    typedef  typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-    auto functors = internal::extractFunctors(evaluator);
-
-    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
-
-      // create a tuple of accessors from Evaluator
-      auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
-      const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
-      size_t GRange=range;
-      if (tileSize>GRange) tileSize=GRange;
-      else if(GRange>tileSize){
-        size_t xMode = GRange % tileSize;
-        if (xMode != 0) GRange += (tileSize - xMode);
-      }
-      // run the kernel
-      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
-        typedef  typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
-        auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-        auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-        if (itemID.get_global_linear_id() < range) {
-          device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
-        }
-      });
-    });
-    dev.m_queue.throw_asynchronous();
-  }
-
-  evaluator.cleanup();
-}
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
deleted file mode 100644
index 063b027e8..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ /dev/null
@@ -1,234 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensroSyclTuple.h
- *
- * \brief:
- *  Minimal implementation of std::tuple that can be used inside a SYCL kernel.
- *
-*****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-namespace utility {
-namespace tuple {
-/// \struct StaticIf
-/// \brief The StaticIf struct is used to statically choose the type based on the
-/// condition.
-template <bool, typename T = void> struct StaticIf;
-/// \brief specialisation of the \ref StaticIf when the condition is true
-template <typename T>
-struct StaticIf<true, T> {
-  typedef T type;
-};
-
-/// \struct Tuple
-/// \brief is a fixed-size collection of heterogeneous values
-/// \ztparam Ts...	-	the types of the elements that the tuple stores.
-/// Empty list is supported.
-template <class... Ts>
-struct Tuple {};
-
-/// \brief specialisation of the \ref Tuple class when the tuple has at least
-/// one element.
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-template <class T, class... Ts>
-struct Tuple<T, Ts...> {
-  Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
-  T head;
-  Tuple<Ts...> tail;
-};
-
-///\ struct ElemTypeHolder
-/// \brief ElemTypeHolder class is used to specify the types of the
-/// elements inside the tuple
-/// \tparam size_t the number of elements inside the tuple
-/// \tparam class the tuple class
-template <size_t, class>
-struct ElemTypeHolder;
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the number of
-/// elements inside the tuple is 1
-template <class T, class... Ts>
-struct ElemTypeHolder<0, Tuple<T, Ts...> > {
-  typedef T type;
-};
-
-/// \brief specialisation of the \ref ElemTypeHolder class when the number of
-/// elements inside the tuple is bigger than 1. It recursively calls itself to
-/// detect the type of each element in the tuple
-/// \tparam T : the type of the first element in the tuple.
-/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
-/// \tparam K is the Kth element in the tuple
-template <size_t k, class T, class... Ts>
-struct ElemTypeHolder<k, Tuple<T, Ts...> > {
-  typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
-};
-
-/// get
-/// \brief Extracts the first element from the tuple.
-/// K=0 represents the first element of the tuple. The tuple cannot be empty.
-/// \tparam Ts... are the type of the elements in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return  typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type
-
-#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
-template <size_t k, class... Ts> \
-typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
-get(CVQual Tuple<Ts...> &t) { \
-  static_assert(sizeof...(Ts)!=0, "The requseted value is bigger than the size of the tuple"); \
-  return t.head; \
-}
-
-TERMINATE_CONDS_TUPLE_GET(const)
-TERMINATE_CONDS_TUPLE_GET()
-#undef TERMINATE_CONDS_TUPLE_GET
-/// get
-/// \brief Extracts the Kth element from the tuple.
-///\tparam K is an integer value in [0,sizeof...(Types)).
-/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
-/// \tparam Ts... are the type of the elements  in the tuple.
-/// \param t is the tuple whose contents to extract
-/// \return  typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type
-#define RECURSIVE_TUPLE_GET(CVQual) \
-template <size_t k, class T, class... Ts> \
-typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
-get(CVQual Tuple<T, Ts...> &t) { \
-  return utility::tuple::get<k - 1>(t.tail); \
-}
-RECURSIVE_TUPLE_GET(const)
-RECURSIVE_TUPLE_GET()
-#undef RECURSIVE_TUPLE_GET
-
-/// make_tuple
-/// \brief Creates a tuple object, deducing the target type from the types of
-/// arguments.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \param args zero or more arguments to construct the tuple from
-/// \return Tuple<Args...>
-template <typename... Args>
-Tuple<Args...> make_tuple(Args... args) {
-  return Tuple<Args...>(args...);
-}
-
-/// size
-/// \brief Provides access to the number of elements in a tuple as a
-/// compile-time constant expression.
-/// \tparam Args the type of the arguments to construct the tuple from
-/// \return size_t
-template <typename... Args>
-static constexpr size_t size(Tuple<Args...> &) {
-  return sizeof...(Args);
-}
-
-/// \struct IndexList
-/// \brief Creates a list of index from the elements in the tuple
-/// \tparam Is... a list of index from [0 to sizeof...(tuple elements))
-template <size_t... Is>
-struct IndexList {};
-
-/// \struct RangeBuilder
-/// \brief Collects internal details for generating index ranges [MIN, MAX)
-/// Declare primary template for index range builder
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder;
-
-/// \brief base Step: Specialisation of the \ref RangeBuilder when the
-/// MIN==MAX. In this case the Is... is [0 to sizeof...(tuple elements))
-/// \tparam MIN is the starting index of the tuple
-/// \tparam Is is [0 to sizeof...(tuple elements))
-template <size_t MIN, size_t... Is>
-struct RangeBuilder<MIN, MIN, Is...> {
-  typedef IndexList<Is...> type;
-};
-
-/// Induction step: Specialisation of the RangeBuilder class when N!=MIN
-/// in this case we are recursively subtracting N by one and adding one
-/// index to Is... list until MIN==N
-/// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
-/// \tparam Is... are the list of generated index so far
-template <size_t MIN, size_t N, size_t... Is>
-struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
-
-/// \brief IndexRange that returns a [MIN, MAX) index range
-/// \tparam MIN is the starting index in the tuple
-/// \tparam MAX is the size of the tuple
-template <size_t MIN, size_t MAX>
-struct IndexRange: RangeBuilder<MIN, MAX>::type {};
-
-/// append_base
-/// \brief unpacking the elements of the input tuple t and creating a new tuple
-/// by adding element a at the end of it.
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \tparam I... is the list of index from [0 to sizeof...(t))
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T, size_t... I>
-Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
-  return utility::tuple::make_tuple(get<I>(t)..., a);
-}
-
-/// append
-/// \brief the deduction function for \ref append_base that automatically
-/// generate the \ref IndexRange
-///\tparam Args... the type of the elements inside the tuple t
-/// \tparam T the type of the new element going to be added at the end of tuple
-/// \param t the tuple on which we want to append a.
-/// \param a the new elements going to be added to the tuple
-/// \return Tuple<Args..., T>
-template <typename... Args, typename T>
-Tuple<Args..., T> append(Tuple<Args...> t, T a) {
-  return utility::tuple::append_base(t, a,  IndexRange<0, sizeof...(Args)>());
-}
-
-/// append_base
-/// \brief This is a specialisation of \ref append_base when we want to
-/// concatenate
-/// tuple t2 at the end of the tuple t1. Here we unpack both tuples, generate the
-/// IndexRange for each of them and create an output tuple T that contains both
-/// elements of t1 and t2.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \tparam I1... is the list of index from [0 to sizeof...(t1))
-/// \tparam I2... is the list of index from [0 to sizeof...(t2))
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
-Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
-  return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
-}
-
-/// append
-/// \brief deduction function for \ref append_base when we are appending tuple
-/// t1 by tuple t2. In this case the \ref IndexRange for both tuple are
-/// automatically generated.
-///\tparam Args1... the type of the elements inside the tuple t1
-///\tparam Args2... the type of the elements inside the tuple t2
-/// \param t1 is the tuple on which we want to append t2.
-/// \param t2 is the tuple that is going to be added on t1.
-/// \return Tuple<Args1..., Args2...>
-template <typename... Args1, typename... Args2>
-Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
-  return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
-}
-}  // tuple
-}  // utility
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 000000000..926ecdd38
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -0,0 +1,303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+namespace Eigen {
+
+/** \class TensorTrace
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor Trace class.
+  *
+  *
+  */
+
+namespace internal {
+template<typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
+{
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef typename remove_reference<Nested>::type _Nested;
+  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static const int Layout = XprTraits::Layout;
+};
+
+template<typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
+{
+  typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template<typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
+{
+  typedef TensorTraceOp<Dims, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+      : m_xpr(expr), m_dims(dims) {
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Dims& dims() const { return m_dims; }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
+
+  protected:
+    typename XprType::Nested m_xpr;
+    const Dims m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
+{
+  typedef TensorTraceOp<Dims, ArgType> XprType;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumReducedDims = internal::array_size<Dims>::value;
+  static const int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
+  {
+
+    EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+
+    const Dims& op_dims = op.dims();
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op_dims[i] >= 0);
+      eigen_assert(op_dims[i] < NumInputDims);
+      m_reduced[op_dims[i]] = true;
+    }
+
+    // All the dimensions should be distinct to compute the trace
+    int num_distinct_reduce_dims = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        ++num_distinct_reduce_dims;
+      }
+    }
+
+    eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+    // Compute the dimensions of the result.
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    int output_index = 0;
+    int reduced_index = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        m_reducedDims[reduced_index] = input_dims[i];
+        if (reduced_index > 0) {
+          // All the trace dimensions must have the same size
+          eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+        }
+        ++reduced_index;
+      }
+      else {
+        m_dimensions[output_index] = input_dims[i];
+        ++output_index;
+      }
+    }
+
+    if (NumReducedDims != 0) {
+      m_traceDim = m_reducedDims[0];
+    }
+
+    // Compute the output strides
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+      }
+      else {
+        m_outputStrides.back() = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+      }
+    }
+
+    // Compute the input strides
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+        }
+      }
+      else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      output_index = 0;
+      reduced_index = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if(m_reduced[i]) {
+          m_reducedStrides[reduced_index] = input_strides[i];
+          ++reduced_index;
+        }
+        else {
+          m_preservedStrides[output_index] = input_strides[i];
+          ++output_index;
+        }
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_dimensions;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Initialize the result
+    CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+    Index index_stride = 0;
+    for (int i = 0; i < NumReducedDims; ++i) {
+      index_stride += m_reducedStrides[i];
+    }
+
+    // If trace is requested along all dimensions, starting index would be 0
+    Index cur_index = 0;
+    if (NumOutputDims != 0)
+      cur_index = firstInput(index);
+    for (Index i = 0; i < m_traceDim; ++i) {
+        result += m_impl.coeff(cur_index);
+        cur_index += index_stride;
+    }
+
+    return result;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+        values[i] = coeff(index + i);
+    }
+    PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+    return result;
+  }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
+
+ protected:
+  // Given the output index, finds the first index in the input tensor used to compute the trace
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[0];
+    }
+    else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[NumOutputDims - 1];
+    }
+    return startInput;
+  }
+
+  Dimensions m_dimensions;
+  TensorEvaluator<ArgType, Device> m_impl;
+  // Initialize the size of the trace dimension
+  Index m_traceDim;
+  const Device EIGEN_DEVICE_REF m_device;
+  array<bool, NumInputDims> m_reduced;
+  array<Index, NumReducedDims> m_reducedDims;
+  array<Index, NumOutputDims> m_outputStrides;
+  array<Index, NumReducedDims> m_reducedStrides;
+  array<Index, NumOutputDims> m_preservedStrides;
+};
+
+
+} // End namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index ffcf8b00f..4f7fd340e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -59,6 +59,7 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -77,6 +78,7 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -99,6 +101,7 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
   };
+  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 template<typename PlainObjectType>
@@ -115,55 +118,56 @@ struct traits<TensorRef<PlainObjectType> >
     Options = BaseTraits::Options,
     Flags = BaseTraits::Flags
   };
+  typedef typename BaseTraits::PointerType PointerType;
 };
 
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -175,50 +179,38 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
 };
 
 
-template <typename PlainObjectType, int Options, template <class> class MakePointer>
-struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
-{
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
-};
-
-template <typename PlainObjectType, int Options, template <class> class MakePointer>
-struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
-{
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
-};
-
 template <typename PlainObjectType>
 struct nested<TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 template <typename PlainObjectType>
 struct nested<const TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>& type;
+  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
 };
 
 }  // end namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index 3523e7c94..d23f2e4c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,6 +23,7 @@ struct static_val {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 0ca2cac84..0beb9ff09 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -22,6 +22,7 @@ namespace Eigen {
   * dimensions.
   */
 namespace internal {
+
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -33,6 +34,8 @@ struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
 };
 
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -65,12 +68,12 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                             DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                             DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                             PaddingType padding_type, Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-        m_padding_type(padding_type), m_padding_value(padding_value) {}
+                                                            : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                            m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                            m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                            m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                            m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+                                                            m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
@@ -80,13 +83,13 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-        m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-        m_padding_left(padding_left), m_padding_right(padding_right),
-        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+                                                           : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+                                                           m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+                                                           m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+                                                           m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+                                                           m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+                                                           m_padding_left(padding_left), m_padding_right(padding_right),
+                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_planes() const { return m_patch_planes; }
@@ -173,19 +176,26 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device)
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -248,12 +258,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
           m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
           m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
           m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
-          const Index dz = m_outputPlanes * m_plane_strides + m_patch_planes_eff - 1 - m_input_planes_eff;
-          const Index dy = m_outputRows * m_row_strides + m_patch_rows_eff - 1 - m_input_rows_eff;
-          const Index dx = m_outputCols * m_col_strides + m_patch_cols_eff - 1 - m_input_cols_eff;
-          m_planePaddingTop = dz - dz / 2;
-          m_rowPaddingTop = dy - dy / 2;
-          m_colPaddingLeft = dx - dx / 2;
+          const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+          const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+          const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
+          m_planePaddingTop = dz / 2;
+          m_rowPaddingTop = dy / 2;
+          m_colPaddingLeft = dx / 2;
           break;
         }
         default:
@@ -322,6 +332,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
     // Fast representations of different variables.
     m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
     m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
     m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
     m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
@@ -341,12 +352,12 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+  EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
@@ -502,30 +513,38 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  Index planePaddingTop() const { return m_planePaddingTop; }
-  Index rowPaddingTop() const { return m_rowPaddingTop; }
-  Index colPaddingLeft() const { return m_colPaddingLeft; }
-  Index outputPlanes() const { return m_outputPlanes; }
-  Index outputRows() const { return m_outputRows; }
-  Index outputCols() const { return m_outputCols; }
-  Index userPlaneStride() const { return m_plane_strides; }
-  Index userRowStride() const { return m_row_strides; }
-  Index userColStride() const { return m_col_strides; }
-  Index userInPlaneStride() const { return m_in_plane_strides; }
-  Index userInRowStride() const { return m_in_row_strides; }
-  Index userInColStride() const { return m_in_col_strides; }
-  Index planeInflateStride() const { return m_plane_inflate_strides; }
-  Index rowInflateStride() const { return m_row_inflate_strides; }
-  Index colInflateStride() const { return m_col_inflate_strides; }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+    m_impl.bind(cgh);
+  }
+#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -535,7 +554,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   Dimensions m_dimensions;
 
-  // Parameters passed to the costructor.
+  // Parameters passed to the constructor.
   Index m_plane_strides;
   Index m_row_strides;
   Index m_col_strides;
@@ -600,6 +619,8 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Scalar m_paddingValue;
 
   TensorEvaluator<ArgType, Device> m_impl;
+
+
 };
 
 
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 0fe0b7c46..54bf9dbb3 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -17,7 +17,7 @@ namespace internal {
 namespace group_theory {
 
 /** \internal
-  * \file CXX11/Tensor/util/TemplateGroupTheory.h
+  * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
   * This file contains C++ templates that implement group theory algorithms.
   *
   * The algorithms allow for a compile-time analysis of finite groups.
@@ -167,7 +167,9 @@ template<
   typename elements,
   bool dont_add_current_element   // = false
 >
-struct dimino_first_step_elements_helper :
+struct dimino_first_step_elements_helper
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : // recursive inheritance is too difficult for Doxygen
   public dimino_first_step_elements_helper<
     Multiply,
     Equality,
@@ -187,6 +189,7 @@ template<
   typename elements
 >
 struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
+#endif // EIGEN_PARSED_BY_DOXYGEN
 {
   typedef elements type;
   constexpr static int global_flags = Equality<current_element, id>::global_flags;
@@ -241,7 +244,7 @@ struct dimino_first_step_elements
   * multiplying all elements in the given subgroup with the new
   * coset representative. Note that the first element of the
   * subgroup is always the identity element, so the first element of
-  * ther result of this template is going to be the coset
+  * the result of this template is going to be the coset
   * representative itself.
   *
   * Note that this template accepts an additional boolean parameter
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
new file mode 100644
index 000000000..e4c59dc3d
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+namespace Eigen {
+
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_plain_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      // Clear the lowest bit (waiter flag) and check that the original state
+      // value was not zero. If it was zero, it means that notify was called
+      // more times than the original count.
+      eigen_plain_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_plain_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1){};
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 71d55552d..4549aa069 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -33,10 +33,10 @@ namespace Eigen {
 //   ec.Notify(true);
 //
 // Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceeding predicate check has
+// cheap, but they are executed only if the preceding predicate check has
 // failed.
 //
-// Algorihtm outline:
+// Algorithm outline:
 // There are two main variables: predicate (managed by user) and state_.
 // Operation closely resembles Dekker mutual algorithm:
 // https://en.wikipedia.org/wiki/Dekker%27s_algorithm
@@ -50,117 +50,114 @@ class EventCount {
  public:
   class Waiter;
 
-  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
-    eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
-    // Initialize epoch to something close to overflow to test overflow.
-    state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
+  EventCount(MaxSizeVector<Waiter>& waiters)
+      : state_(kStackMask), waiters_(waiters) {
+    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
   }
 
   ~EventCount() {
     // Ensure there are no waiters.
-    eigen_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
+    eigen_plain_assert(state_.load() == kStackMask);
   }
 
   // Prewait prepares for waiting.
-  // After calling this function the thread must re-check the wait predicate
-  // and call either CancelWait or CommitWait passing the same Waiter object.
-  void Prewait(Waiter* w) {
-    w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
-    std::atomic_thread_fence(std::memory_order_seq_cst);
+  // After calling Prewait, the thread must re-check the wait predicate
+  // and then call either CancelWait or CommitWait.
+  void Prewait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state);
+      uint64_t newstate = state + kWaiterInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_seq_cst))
+        return;
+    }
   }
 
-  // CommitWait commits waiting.
+  // CommitWait commits waiting after Prewait.
   void CommitWait(Waiter* w) {
+    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
     w->state = Waiter::kNotSignaled;
-    // Modification epoch of this waiter.
-    uint64_t epoch =
-        (w->epoch & kEpochMask) +
-        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+    const uint64_t me = (w - &waiters_[0]) | w->epoch;
     uint64_t state = state_.load(std::memory_order_seq_cst);
     for (;;) {
-      if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
-        // calls either CancelWait or CommitWait, or is notified.
-        EIGEN_THREAD_YIELD();
-        state = state_.load(std::memory_order_seq_cst);
-        continue;
+      CheckState(state, true);
+      uint64_t newstate;
+      if ((state & kSignalMask) != 0) {
+        // Consume the signal and return immidiately.
+        newstate = state - kWaiterInc - kSignalInc;
+      } else {
+        // Remove this thread from pre-wait counter and add to the waiter stack.
+        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+        w->next.store(state & (kStackMask | kEpochMask),
+                      std::memory_order_relaxed);
       }
-      // We've already been notified.
-      if (int64_t((state & kEpochMask) - epoch) > 0) return;
-      // Remove this thread from prewait counter and add it to the waiter list.
-      eigen_assert((state & kWaiterMask) != 0);
-      uint64_t newstate = state - kWaiterInc + kEpochInc;
-      newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
-      if ((state & kStackMask) == kStackMask)
-        w->next.store(nullptr, std::memory_order_relaxed);
-      else
-        w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
+      CheckState(newstate);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_release))
-        break;
+                                       std::memory_order_acq_rel)) {
+        if ((state & kSignalMask) == 0) {
+          w->epoch += kEpochInc;
+          Park(w);
+        }
+        return;
+      }
     }
-    Park(w);
   }
 
   // CancelWait cancels effects of the previous Prewait call.
-  void CancelWait(Waiter* w) {
-    uint64_t epoch =
-        (w->epoch & kEpochMask) +
-        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
+  void CancelWait() {
     uint64_t state = state_.load(std::memory_order_relaxed);
     for (;;) {
-      if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
-        // calls either CancelWait or CommitWait, or is notified.
-        EIGEN_THREAD_YIELD();
-        state = state_.load(std::memory_order_relaxed);
-        continue;
-      }
-      // We've already been notified.
-      if (int64_t((state & kEpochMask) - epoch) > 0) return;
-      // Remove this thread from prewait counter.
-      eigen_assert((state & kWaiterMask) != 0);
-      if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
-                                       std::memory_order_relaxed))
+      CheckState(state, true);
+      uint64_t newstate = state - kWaiterInc;
+      // We don't know if the thread was also notified or not,
+      // so we should not consume a signal unconditionaly.
+      // Only if number of waiters is equal to number of signals,
+      // we know that the thread was notified and we must take away the signal.
+      if (((state & kWaiterMask) >> kWaiterShift) ==
+          ((state & kSignalMask) >> kSignalShift))
+        newstate -= kSignalInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate,
+                                       std::memory_order_acq_rel))
         return;
     }
   }
 
   // Notify wakes one or all waiting threads.
   // Must be called after changing the associated wait predicate.
-  void Notify(bool all) {
+  void Notify(bool notifyAll) {
     std::atomic_thread_fence(std::memory_order_seq_cst);
     uint64_t state = state_.load(std::memory_order_acquire);
     for (;;) {
+      CheckState(state);
+      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
       // Easy case: no waiters.
-      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
-        return;
-      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      if ((state & kStackMask) == kStackMask && waiters == signals) return;
       uint64_t newstate;
-      if (all) {
-        // Reset prewait counter and empty wait list.
-        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
-      } else if (waiters) {
+      if (notifyAll) {
+        // Empty wait stack and set signal to number of pre-wait threads.
+        newstate =
+            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+      } else if (signals < waiters) {
         // There is a thread in pre-wait state, unblock it.
-        newstate = state + kEpochInc - kWaiterInc;
+        newstate = state + kSignalInc;
       } else {
         // Pop a waiter from list and unpark it.
         Waiter* w = &waiters_[state & kStackMask];
-        Waiter* wnext = w->next.load(std::memory_order_relaxed);
-        uint64_t next = kStackMask;
-        if (wnext != nullptr) next = wnext - &waiters_[0];
-        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
-        // can't happen because a waiter is re-pushed onto the stack only after
-        // it was in the pre-wait state which inevitably leads to epoch
-        // increment.
-        newstate = (state & kEpochMask) + next;
+        uint64_t next = w->next.load(std::memory_order_relaxed);
+        newstate = (state & (kWaiterMask | kSignalMask)) | next;
       }
+      CheckState(newstate);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_acquire)) {
-        if (!all && waiters) return;  // unblocked pre-wait thread
+                                       std::memory_order_acq_rel)) {
+        if (!notifyAll && (signals < waiters))
+          return;  // unblocked pre-wait thread
         if ((state & kStackMask) == kStackMask) return;
         Waiter* w = &waiters_[state & kStackMask];
-        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
         Unpark(w);
         return;
       }
@@ -169,12 +166,13 @@ class EventCount {
 
   class Waiter {
     friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter
+    // objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
     std::mutex mu;
     std::condition_variable cv;
-    uint64_t epoch;
-    unsigned state;
+    uint64_t epoch = 0;
+    unsigned state = kNotSignaled;
     enum {
       kNotSignaled,
       kWaiting,
@@ -184,23 +182,41 @@ class EventCount {
 
  private:
   // State_ layout:
-  // - low kStackBits is a stack of waiters committed wait.
+  // - low kWaiterBits is a stack of waiters committed wait
+  //   (indexes in waiters_ array are used as stack elements,
+  //   kStackMask means empty stack).
   // - next kWaiterBits is count of waiters in prewait state.
-  // - next kEpochBits is modification counter.
-  static const uint64_t kStackBits = 16;
-  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
-  static const uint64_t kWaiterBits = 16;
-  static const uint64_t kWaiterShift = 16;
+  // - next kWaiterBits is count of pending signals.
+  // - remaining bits are ABA counter for the stack.
+  //   (stored in Waiter node and incremented on push).
+  static const uint64_t kWaiterBits = 14;
+  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+  static const uint64_t kWaiterShift = kWaiterBits;
   static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
                                       << kWaiterShift;
-  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
-  static const uint64_t kEpochBits = 32;
-  static const uint64_t kEpochShift = 32;
+  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+  static const uint64_t kSignalShift = 2 * kWaiterBits;
+  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
+                                      << kSignalShift;
+  static const uint64_t kSignalInc = 1ull << kSignalShift;
+  static const uint64_t kEpochShift = 3 * kWaiterBits;
+  static const uint64_t kEpochBits = 64 - kEpochShift;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
   std::atomic<uint64_t> state_;
   MaxSizeVector<Waiter>& waiters_;
 
+  static void CheckState(uint64_t state, bool waiter = false) {
+    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+    eigen_plain_assert(waiters >= signals);
+    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+    eigen_plain_assert(!waiter || waiters > 0);
+    (void)waiters;
+    (void)signals;
+  }
+
   void Park(Waiter* w) {
     std::unique_lock<std::mutex> lock(w->mu);
     while (w->state != Waiter::kSignaled) {
@@ -209,10 +225,10 @@ class EventCount {
     }
   }
 
-  void Unpark(Waiter* waiters) {
-    Waiter* next = nullptr;
-    for (Waiter* w = waiters; w; w = next) {
-      next = w->next.load(std::memory_order_relaxed);
+  void Unpark(Waiter* w) {
+    for (Waiter* next; w; w = next) {
+      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
       unsigned state;
       {
         std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 354bce52a..23a2b5467 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -10,79 +10,116 @@
 #ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 #define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 
-
 namespace Eigen {
 
 template <typename Environment>
-class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
  public:
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
-  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
+  ThreadPoolTempl(int num_threads, Environment env = Environment())
+      : ThreadPoolTempl(num_threads, true, env) {}
+
+  ThreadPoolTempl(int num_threads, bool allow_spinning,
+                  Environment env = Environment())
       : env_(env),
-        threads_(num_threads),
-        queues_(num_threads),
-        coprimes_(num_threads),
+        num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        thread_data_(num_threads),
+        all_coprimes_(num_threads),
         waiters_(num_threads),
+        global_steal_partition_(EncodePartition(0, num_threads_)),
         blocked_(0),
         spinning_(0),
         done_(false),
+        cancelled_(false),
         ec_(waiters_) {
-    waiters_.resize(num_threads);
-
-    // Calculate coprimes of num_threads.
-    // Coprimes are used for a random walk over all threads in Steal
+    waiters_.resize(num_threads_);
+    // Calculate coprimes of all numbers [1, num_threads].
+    // Coprimes are used for random walks over all threads in Steal
     // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
-    // a walk starting thread index t and calculate num_threads - 1 subsequent
+    // a random starting thread index t and calculate num_threads - 1 subsequent
     // indices as (t + coprime) % num_threads, we will cover all threads without
     // repetitions (effectively getting a presudo-random permutation of thread
     // indices).
-    for (int i = 1; i <= num_threads; i++) {
-      unsigned a = i;
-      unsigned b = num_threads;
-      // If GCD(a, b) == 1, then a and b are coprimes.
-      while (b != 0) {
-        unsigned tmp = a;
-        a = b;
-        b = tmp % b;
-      }
-      if (a == 1) {
-        coprimes_.push_back(i);
-      }
-    }
-    for (int i = 0; i < num_threads; i++) {
-      queues_.push_back(new Queue());
+    eigen_plain_assert(num_threads_ < kMaxThreads);
+    for (int i = 1; i <= num_threads_; ++i) {
+      all_coprimes_.emplace_back(i);
+      ComputeCoprimes(i, &all_coprimes_.back());
     }
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+#ifndef EIGEN_THREAD_LOCAL
+    init_barrier_.reset(new Barrier(num_threads_));
+#endif
+    thread_data_.resize(num_threads_);
+    for (int i = 0; i < num_threads_; i++) {
+      SetStealPartition(i, EncodePartition(0, num_threads_));
+      thread_data_[i].thread.reset(
+          env_.CreateThread([this, i]() { WorkerLoop(i); }));
     }
+#ifndef EIGEN_THREAD_LOCAL
+    // Wait for workers to initialize per_thread_map_. Otherwise we might race
+    // with them in Schedule or CurrentThreadId.
+    init_barrier_->Wait();
+#endif
   }
 
-  ~NonBlockingThreadPoolTempl() {
+  ~ThreadPoolTempl() {
     done_ = true;
+
     // Now if all threads block without work, they will start exiting.
     // But note that threads can continue to work arbitrary long,
     // block, submit new work, unblock and otherwise live full life.
-    ec_.Notify(true);
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < thread_data_.size(); i++) {
+        thread_data_[i].queue.Flush();
+      }
+    }
+    // Join threads explicitly (by destroying) to avoid destruction order within
+    // this class.
+    for (size_t i = 0; i < thread_data_.size(); ++i)
+      thread_data_[i].thread.reset();
+  }
+
+  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
 
-    // Join threads explicitly to avoid destruction order issues.
-    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
-    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
+    // Pass this information to each thread queue.
+    for (int i = 0; i < num_threads_; i++) {
+      const auto& pair = partitions[i];
+      unsigned start = pair.first, end = pair.second;
+      AssertBounds(start, end);
+      unsigned val = EncodePartition(start, end);
+      SetStealPartition(i, val);
+    }
+  }
+
+  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
+    ScheduleWithHint(std::move(fn), 0, num_threads_);
   }
 
-  void Schedule(std::function<void()> fn) {
+  void ScheduleWithHint(std::function<void()> fn, int start,
+                        int limit) override {
     Task t = env_.CreateTask(std::move(fn));
     PerThread* pt = GetPerThread();
     if (pt->pool == this) {
       // Worker thread of this pool, push onto the thread's queue.
-      Queue* q = queues_[pt->thread_id];
-      t = q->PushFront(std::move(t));
+      Queue& q = thread_data_[pt->thread_id].queue;
+      t = q.PushFront(std::move(t));
     } else {
       // A free-standing thread (or worker of another pool), push onto a random
       // queue.
-      Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
-      t = q->PushBack(std::move(t));
+      eigen_plain_assert(start < limit);
+      eigen_plain_assert(limit <= num_threads_);
+      int num_queues = limit - start;
+      int rnd = Rand(&pt->rand) % num_queues;
+      eigen_plain_assert(start + rnd < limit);
+      Queue& q = thread_data_[start + rnd].queue;
+      t = q.PushBack(std::move(t));
     }
     // Note: below we touch this after making w available to worker threads.
     // Strictly speaking, this can lead to a racy-use-after-free. Consider that
@@ -91,19 +128,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // completes overall computations, which in turn leads to destruction of
     // this. We expect that such scenario is prevented by program, that is,
     // this is kept alive while any threads can potentially be in Schedule.
-    if (!t.f)
+    if (!t.f) {
       ec_.Notify(false);
-    else
+    } else {
       env_.ExecuteTask(t);  // Push failed, execute directly.
+    }
   }
 
-  int NumThreads() const final {
-    return static_cast<int>(threads_.size());
+  void Cancel() EIGEN_OVERRIDE {
+    cancelled_ = true;
+    done_ = true;
+
+    // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+    for (size_t i = 0; i < thread_data_.size(); i++) {
+      thread_data_[i].thread->OnCancel();
+    }
+#endif
+
+    // Wake up the threads without work to let them exit on their own.
+    ec_.Notify(true);
   }
 
-  int CurrentThreadId() const final {
-    const PerThread* pt =
-        const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
+  int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+  int CurrentThreadId() const EIGEN_FINAL {
+    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
     if (pt->pool == this) {
       return pt->thread_id;
     } else {
@@ -112,72 +162,191 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
  private:
+  // Create a single atomic<int> that encodes start and limit information for
+  // each thread.
+  // We expect num_threads_ < 65536, so we can store them in a single
+  // std::atomic<unsigned>.
+  // Exposed publicly as static functions so that external callers can reuse
+  // this encode/decode logic for maintaining their own thread-safe copies of
+  // scheduling and steal domain(s).
+  static const int kMaxPartitionBits = 16;
+  static const int kMaxThreads = 1 << kMaxPartitionBits;
+
+  inline unsigned EncodePartition(unsigned start, unsigned limit) {
+    return (start << kMaxPartitionBits) | limit;
+  }
+
+  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  void AssertBounds(int start, int end) {
+    eigen_plain_assert(start >= 0);
+    eigen_plain_assert(start < end);  // non-zero sized partition
+    eigen_plain_assert(end <= num_threads_);
+  }
+
+  inline void SetStealPartition(size_t i, unsigned val) {
+    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+  }
+
+  inline unsigned GetStealPartition(int i) {
+    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
+  }
+
+  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+    for (int i = 1; i <= N; i++) {
+      unsigned a = i;
+      unsigned b = N;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes->push_back(i);
+      }
+    }
+  }
+
   typedef typename Environment::EnvThread Thread;
 
   struct PerThread {
-    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
-    NonBlockingThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    uint64_t rand;  // Random generator state.
-    int thread_id;  // Worker thread index in pool.
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+#ifndef EIGEN_THREAD_LOCAL
+    // Prevent false sharing.
+    char pad_[128];
+#endif
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
   };
 
   Environment env_;
-  MaxSizeVector<Thread*> threads_;
-  MaxSizeVector<Queue*> queues_;
-  MaxSizeVector<unsigned> coprimes_;
+  const int num_threads_;
+  const bool allow_spinning_;
+  MaxSizeVector<ThreadData> thread_data_;
+  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
   MaxSizeVector<EventCount::Waiter> waiters_;
+  unsigned global_steal_partition_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
   EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+  std::unique_ptr<Barrier> init_barrier_;
+  std::mutex per_thread_map_mutex_;  // Protects per_thread_map_.
+  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
 
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+    std::unique_ptr<PerThread> new_pt(new PerThread());
+    per_thread_map_mutex_.lock();
+    bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+    eigen_plain_assert(insertOK);
+    EIGEN_UNUSED_VARIABLE(insertOK);
+    per_thread_map_mutex_.unlock();
+    init_barrier_->Notify();
+    init_barrier_->Wait();
+#endif
     PerThread* pt = GetPerThread();
     pt->pool = this;
-    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
+    pt->rand = GlobalThreadIdHash();
     pt->thread_id = thread_id;
-    Queue* q = queues_[thread_id];
+    Queue& q = thread_data_[thread_id].queue;
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    for (;;) {
-      Task t = q->PopFront();
-      if (!t.f) {
-        t = Steal();
+    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
+    // proportional to num_threads_ and we assume that new work is scheduled at
+    // a constant rate, so we set spin_count to 5000 / num_threads_. The
+    // constant was picked based on a fair dice roll, tune it.
+    const int spin_count =
+        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads the single thread
+      // pools tend to be used for.
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        for (int i = 0; i < spin_count && !t.f; i++) {
+          if (!cancelled_.load(std::memory_order_relaxed)) {
+            t = q.PopFront();
+          }
+        }
         if (!t.f) {
-          // Leave one thread spinning. This reduces latency.
-          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
-          // Also, the time it takes to attempt to steal work 1000 times depends
-          // on the size of the thread pool. However the speed at which the user
-          // of the thread pool submit tasks is independent of the size of the
-          // pool. Consider a time based limit instead.
-          if (!spinning_ && !spinning_.exchange(true)) {
-            for (int i = 0; i < 1000 && !t.f; i++) {
-              t = Steal();
-            }
-            spinning_ = false;
+          if (!WaitForWork(waiter, &t)) {
+            return;
           }
+        }
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
+      }
+    } else {
+      while (!cancelled_) {
+        Task t = q.PopFront();
+        if (!t.f) {
+          t = LocalSteal();
           if (!t.f) {
-            if (!WaitForWork(waiter, &t)) {
-              return;
+            t = GlobalSteal();
+            if (!t.f) {
+              // Leave one thread spinning. This reduces latency.
+              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+                for (int i = 0; i < spin_count && !t.f; i++) {
+                  if (!cancelled_.load(std::memory_order_relaxed)) {
+                    t = GlobalSteal();
+                  } else {
+                    return;
+                  }
+                }
+                spinning_ = false;
+              }
+              if (!t.f) {
+                if (!WaitForWork(waiter, &t)) {
+                  return;
+                }
+              }
             }
           }
         }
-      }
-      if (t.f) {
-        env_.ExecuteTask(t);
+        if (t.f) {
+          env_.ExecuteTask(t);
+        }
       }
     }
   }
 
-  // Steal tries to steal work from other worker threads in best-effort manner.
-  Task Steal() {
+  // Steal tries to steal work from other worker threads in the range [start,
+  // limit) in best-effort manner.
+  Task Steal(unsigned start, unsigned limit) {
     PerThread* pt = GetPerThread();
-    const size_t size = queues_.size();
+    const size_t size = limit - start;
     unsigned r = Rand(&pt->rand);
-    unsigned inc = coprimes_[r % coprimes_.size()];
-    unsigned victim = r % size;
+    // Reduce r into [0, size) range, this utilizes trick from
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
+
     for (unsigned i = 0; i < size; i++) {
-      Task t = queues_[victim]->PopBack();
+      eigen_plain_assert(start + victim < limit);
+      Task t = thread_data_[start + victim].queue.PopBack();
       if (t.f) {
         return t;
       }
@@ -189,27 +358,52 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return Task();
   }
 
+  // Steals work within threads belonging to the partition.
+  Task LocalSteal() {
+    PerThread* pt = GetPerThread();
+    unsigned partition = GetStealPartition(pt->thread_id);
+    // If thread steal partition is the same as global partition, there is no
+    // need to go through the steal loop twice.
+    if (global_steal_partition_ == partition) return Task();
+    unsigned start, limit;
+    DecodePartition(partition, &start, &limit);
+    AssertBounds(start, limit);
+
+    return Steal(start, limit);
+  }
+
+  // Steals work from any other thread in the pool.
+  Task GlobalSteal() {
+    return Steal(0, num_threads_);
+  }
+
+
   // WaitForWork blocks until new work is available (returns true), or if it is
   // time to exit (returns false). Can optionally return a task to execute in t
   // (in such case t.f != nullptr on return).
   bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
-    eigen_assert(!t->f);
+    eigen_plain_assert(!t->f);
     // We already did best-effort emptiness check in Steal, so prepare for
     // blocking.
-    ec_.Prewait(waiter);
+    ec_.Prewait();
     // Now do a reliable emptiness check.
     int victim = NonEmptyQueueIndex();
     if (victim != -1) {
-      ec_.CancelWait(waiter);
-      *t = queues_[victim]->PopBack();
-      return true;
+      ec_.CancelWait();
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = thread_data_[victim].queue.PopBack();
+        return true;
+      }
     }
     // Number of blocked threads is used as termination condition.
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    if (done_ && blocked_ == threads_.size()) {
-      ec_.CancelWait(waiter);
+    // TODO is blocked_ required to be unsigned?
+    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+      ec_.CancelWait();
       // Almost done, but need to re-check queues.
       // Consider that all queues are empty and all worker threads are preempted
       // right after incrementing blocked_ above. Now a free-standing thread
@@ -236,12 +430,15 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
 
   int NonEmptyQueueIndex() {
     PerThread* pt = GetPerThread();
-    const size_t size = queues_.size();
+    // We intentionally design NonEmptyQueueIndex to steal work from
+    // anywhere in the queue so threads don't block in WaitForWork() forever
+    // when all threads in their partition go to sleep. Steal is still local.
+    const size_t size = thread_data_.size();
     unsigned r = Rand(&pt->rand);
-    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
     unsigned victim = r % size;
     for (unsigned i = 0; i < size; i++) {
-      if (!queues_[victim]->Empty()) {
+      if (!thread_data_[victim].queue.Empty()) {
         return victim;
       }
       victim += inc;
@@ -252,10 +449,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return -1;
   }
 
-  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+  }
+
+  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+    static PerThread dummy;
+    auto it = per_thread_map_.find(GlobalThreadIdHash());
+    if (it == per_thread_map_.end()) {
+      return &dummy;
+    } else {
+      return it->second.get();
+    }
+#else
     EIGEN_THREAD_LOCAL PerThread per_thread_;
     PerThread* pt = &per_thread_;
     return pt;
+#endif
   }
 
   static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
@@ -263,11 +474,12 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // Update the internal state
     *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
     // Generate the random output (using the PCG-XSH-RS scheme)
-    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+    return static_cast<unsigned>((current ^ (current >> 22)) >>
+                                 (22 + (current >> 61)));
   }
 };
 
-typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
 
 }  // namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 05ed76cbe..b572ebcdf 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -10,7 +10,6 @@
 #ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 #define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 
-
 namespace Eigen {
 
 // RunQueue is a fixed-size, partially non-blocking deque or Work items.
@@ -40,14 +39,14 @@ class RunQueue {
  public:
   RunQueue() : front_(0), back_(0) {
     // require power-of-two for fast masking
-    eigen_assert((kSize & (kSize - 1)) == 0);
-    eigen_assert(kSize > 2);            // why would you do this?
-    eigen_assert(kSize <= (64 << 10));  // leave enough space for counter
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
     for (unsigned i = 0; i < kSize; i++)
       array_[i].state.store(kEmpty, std::memory_order_relaxed);
   }
 
-  ~RunQueue() { eigen_assert(Size() == 0); }
+  ~RunQueue() { eigen_plain_assert(Size() == 0); }
 
   // PushFront inserts w at the beginning of the queue.
   // If queue is full returns w, otherwise returns default-constructed Work.
@@ -98,11 +97,9 @@ class RunQueue {
   }
 
   // PopBack removes and returns the last elements in the queue.
-  // Can fail spuriously.
   Work PopBack() {
     if (Empty()) return Work();
-    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
-    if (!lock) return Work();
+    std::unique_lock<std::mutex> lock(mutex_);
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem* e = &array_[back & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
@@ -116,11 +113,10 @@ class RunQueue {
   }
 
   // PopBackHalf removes and returns half last elements in the queue.
-  // Returns number of elements removed. But can also fail spuriously.
+  // Returns number of elements removed.
   unsigned PopBackHalf(std::vector<Work>* result) {
     if (Empty()) return 0;
-    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
-    if (!lock) return 0;
+    std::unique_lock<std::mutex> lock(mutex_);
     unsigned back = back_.load(std::memory_order_relaxed);
     unsigned size = Size();
     unsigned mid = back;
@@ -131,15 +127,14 @@ class RunQueue {
       Elem* e = &array_[mid & kMask];
       uint8_t s = e->state.load(std::memory_order_relaxed);
       if (n == 0) {
-        if (s != kReady ||
-            !e->state.compare_exchange_strong(s, kBusy,
-                                              std::memory_order_acquire))
+        if (s != kReady || !e->state.compare_exchange_strong(
+                               s, kBusy, std::memory_order_acquire))
           continue;
         start = mid;
       } else {
         // Note: no need to store temporal kBusy, we exclusively own these
         // elements.
-        eigen_assert(s == kReady);
+        eigen_plain_assert(s == kReady);
       }
       result->push_back(std::move(e->w));
       e->state.store(kEmpty, std::memory_order_release);
@@ -152,30 +147,18 @@ class RunQueue {
 
   // Size returns current queue size.
   // Can be called by any thread at any time.
-  unsigned Size() const {
-    // Emptiness plays critical role in thread pool blocking. So we go to great
-    // effort to not produce false positives (claim non-empty queue as empty).
-    for (;;) {
-      // Capture a consistent snapshot of front/tail.
-      unsigned front = front_.load(std::memory_order_acquire);
-      unsigned back = back_.load(std::memory_order_acquire);
-      unsigned front1 = front_.load(std::memory_order_relaxed);
-      if (front != front1) continue;
-      int size = (front & kMask2) - (back & kMask2);
-      // Fix overflow.
-      if (size < 0) size += 2 * kSize;
-      // Order of modification in push/pop is crafted to make the queue look
-      // larger than it is during concurrent modifications. E.g. pop can
-      // decrement size before the corresponding push has incremented it.
-      // So the computed size can be up to kSize + 1, fix it.
-      if (size > static_cast<int>(kSize)) size = kSize;
-      return size;
-    }
-  }
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
 
   // Empty tests whether container is empty.
   // Can be called by any thread at any time.
-  bool Empty() const { return Size() == 0; }
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
 
  private:
   static const unsigned kMask = kSize - 1;
@@ -191,7 +174,7 @@ class RunQueue {
   };
   std::mutex mutex_;
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
-  // front/back, repsectively. The remaining bits contain modification counters
+  // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
   // between empty and full conditions (if we would use log(kSize) bits for
   // position, these conditions would be indistinguishable); (2) obtain
@@ -201,6 +184,49 @@ class RunQueue {
   std::atomic<unsigned> back_;
   Elem array_[kSize];
 
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template<bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE
+  unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (size < 0) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (size > static_cast<int>(kSize)) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
   RunQueue(const RunQueue&) = delete;
   void operator=(const RunQueue&) = delete;
 };
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
deleted file mode 100644
index e75d0f467..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
-
-namespace Eigen {
-
-// The implementation of the ThreadPool type ensures that the Schedule method
-// runs the functions it is provided in FIFO order when the scheduling is done
-// by a single thread.
-// Environment provides a way to create threads and also allows to intercept
-// task submission and execution.
-template <typename Environment>
-class SimpleThreadPoolTempl : public ThreadPoolInterface {
- public:
-  // Construct a pool that contains "num_threads" threads.
-  explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
-      : env_(env), threads_(num_threads), waiters_(num_threads) {
-    for (int i = 0; i < num_threads; i++) {
-      threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); }));
-    }
-  }
-
-  // Wait until all scheduled work has finished and then destroy the
-  // set of threads.
-  ~SimpleThreadPoolTempl() {
-    {
-      // Wait for all work to get done.
-      std::unique_lock<std::mutex> l(mu_);
-      while (!pending_.empty()) {
-        empty_.wait(l);
-      }
-      exiting_ = true;
-
-      // Wakeup all waiters.
-      for (auto w : waiters_) {
-        w->ready = true;
-        w->task.f = nullptr;
-        w->cv.notify_one();
-      }
-    }
-
-    // Wait for threads to finish.
-    for (auto t : threads_) {
-      delete t;
-    }
-  }
-
-  // Schedule fn() for execution in the pool of threads. The functions are
-  // executed in the order in which they are scheduled.
-  void Schedule(std::function<void()> fn) final {
-    Task t = env_.CreateTask(std::move(fn));
-    std::unique_lock<std::mutex> l(mu_);
-    if (waiters_.empty()) {
-      pending_.push_back(std::move(t));
-    } else {
-      Waiter* w = waiters_.back();
-      waiters_.pop_back();
-      w->ready = true;
-      w->task = std::move(t);
-      w->cv.notify_one();
-    }
-  }
-
-  int NumThreads() const final {
-    return static_cast<int>(threads_.size());
-  }
-
-  int CurrentThreadId() const final {
-    const PerThread* pt = this->GetPerThread();
-    if (pt->pool == this) {
-      return pt->thread_id;
-    } else {
-      return -1;
-    }
-  }
-
- protected:
-  void WorkerLoop(int thread_id) {
-    std::unique_lock<std::mutex> l(mu_);
-    PerThread* pt = GetPerThread();
-    pt->pool = this;
-    pt->thread_id = thread_id;
-    Waiter w;
-    Task t;
-    while (!exiting_) {
-      if (pending_.empty()) {
-        // Wait for work to be assigned to me
-        w.ready = false;
-        waiters_.push_back(&w);
-        while (!w.ready) {
-          w.cv.wait(l);
-        }
-        t = w.task;
-        w.task.f = nullptr;
-      } else {
-        // Pick up pending work
-        t = std::move(pending_.front());
-        pending_.pop_front();
-        if (pending_.empty()) {
-          empty_.notify_all();
-        }
-      }
-      if (t.f) {
-        mu_.unlock();
-        env_.ExecuteTask(t);
-        t.f = nullptr;
-        mu_.lock();
-      }
-    }
-  }
-
- private:
-  typedef typename Environment::Task Task;
-  typedef typename Environment::EnvThread Thread;
-
-  struct Waiter {
-    std::condition_variable cv;
-    Task task;
-    bool ready;
-  };
-
-  struct PerThread {
-    constexpr PerThread() : pool(NULL), thread_id(-1) { }
-    SimpleThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    int thread_id;                // Worker thread index in pool.
-  };
-
-  Environment env_;
-  std::mutex mu_;
-  MaxSizeVector<Thread*> threads_;  // All threads
-  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
-  std::deque<Task> pending_;        // Queue of pending work
-  std::condition_variable empty_;   // Signaled on pending_.empty()
-  bool exiting_ = false;
-
-  PerThread* GetPerThread() const {
-    EIGEN_THREAD_LOCAL PerThread per_thread;
-    return &per_thread;
-  }
-};
-
-typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
-
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
new file mode 100644
index 000000000..a05685f11
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+  #define EIGEN_THREAD_CANCEL(t)                  \
+    pthread_cancel(t.native_handle());
+  #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index 399f95cc1..d94a06416 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,6 +23,8 @@ struct StlThreadEnvironment {
    public:
     EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
     ~EnvThread() { thr_.join(); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() { }
 
    private:
     std::thread thr_;
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index cfa221732..4e6847404 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -10,13 +10,292 @@
 #ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 #define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 
-// Try to come up with a portable implementation of thread local variables
-#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
-#define EIGEN_THREAD_LOCAL static __thread
-#elif EIGEN_COMP_CLANG
-#define EIGEN_THREAD_LOCAL static __thread
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
 #else
+
+#if EIGEN_MAX_CPP_VER >= 11 &&                         \
+    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
+     __has_feature(cxx_thread_local)                || \
+     (EIGEN_COMP_MSVC >= 1900) )
 #define EIGEN_THREAD_LOCAL static thread_local
 #endif
 
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if defined(__apple_build_version__) &&     \
+    ((__apple_build_version__ < 8000042) || \
+     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif  // __has_include(<android/ndk-version.h>)
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+    defined(__NDK_MINOR__) &&                                               \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif  // defined(__ANDROID__) && defined(__clang__)
+
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+  void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+  void operator()(T&) const {}
+};
+
+}  // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+//   struct Counter {
+//     int value = 0;
+//   }
+//
+//   Eigen::ThreadLocal<Counter> counter(10);
+//
+//   // Each thread will have access to it's own counter object.
+//   Counter& cnt = counter.local();
+//   cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T,
+          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+          typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+  // We preallocate default constructed elements in MaxSizedVector.
+  static_assert(std::is_default_constructible<T>::value,
+                "ThreadLocal data type must be default constructible");
+
+ public:
+  explicit ThreadLocal(int capacity)
+      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize)
+      : ThreadLocal(capacity, std::move(initialize),
+                    internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize, Release release)
+      : initialize_(std::move(initialize)),
+        release_(std::move(release)),
+        capacity_(capacity),
+        data_(capacity_),
+        ptr_(capacity_),
+        filled_records_(0) {
+    eigen_assert(capacity_ >= 0);
+    data_.resize(capacity_);
+    for (int i = 0; i < capacity_; ++i) {
+      ptr_.emplace_back(nullptr);
+    }
+  }
+
+  T& local() {
+    std::thread::id this_thread = std::this_thread::get_id();
+    if (capacity_ == 0) return SpilledLocal(this_thread);
+
+    std::size_t h = std::hash<std::thread::id>()(this_thread);
+    const int start_idx = h % capacity_;
+
+    // NOTE: From the definition of `std::this_thread::get_id()` it is
+    // guaranteed that we never can have concurrent insertions with the same key
+    // to our hash-map like data structure. If we didn't find an element during
+    // the initial traversal, it's guaranteed that no one else could have
+    // inserted it while we are in this function. This allows to massively
+    // simplify out lock-free insert-only hash map.
+
+    // Check if we already have an element for `this_thread`.
+    int idx = start_idx;
+    while (ptr_[idx].load() != nullptr) {
+      ThreadIdAndValue& record = *(ptr_[idx].load());
+      if (record.thread_id == this_thread) return record.value;
+
+      idx += 1;
+      if (idx >= capacity_) idx -= capacity_;
+      if (idx == start_idx) break;
+    }
+
+    // If we are here, it means that we found an insertion point in lookup
+    // table at `idx`, or we did a full traversal and table is full.
+
+    // If lock-free storage is full, fallback on mutex.
+    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+    // We double check that we still have space to insert an element into a lock
+    // free storage. If old value in `filled_records_` is larger than the
+    // records capacity, it means that some other thread added an element while
+    // we were traversing lookup table.
+    int insertion_index =
+        filled_records_.fetch_add(1, std::memory_order_relaxed);
+    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+    // At this point it's guaranteed that we can access to
+    // data_[insertion_index_] without a data race.
+    data_[insertion_index].thread_id = this_thread;
+    initialize_(data_[insertion_index].value);
+
+    // That's the pointer we'll put into the lookup table.
+    ThreadIdAndValue* inserted = &data_[insertion_index];
+
+    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+    ThreadIdAndValue* empty = nullptr;
+
+    // Now we have to find an insertion point into the lookup table. We start
+    // from the `idx` that was identified as an insertion point above, it's
+    // guaranteed that we will have an empty record somewhere in a lookup table
+    // (because we created a record in the `data_`).
+    const int insertion_idx = idx;
+
+    do {
+      // Always start search from the original insertion candidate.
+      idx = insertion_idx;
+      while (ptr_[idx].load() != nullptr) {
+        idx += 1;
+        if (idx >= capacity_) idx -= capacity_;
+        // If we did a full loop, it means that we don't have any free entries
+        // in the lookup table, and this means that something is terribly wrong.
+        eigen_assert(idx != insertion_idx);
+      }
+      // Atomic CAS of the pointer guarantees that any other thread, that will
+      // follow this pointer will see all the mutations in the `data_`.
+    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+    return inserted->value;
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  void ForEach(std::function<void(std::thread::id, T&)> f) {
+    // Reading directly from `data_` is unsafe, because only CAS to the
+    // record in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      f(record->thread_id, record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      f(kv.first, kv.second);
+    }
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  ~ThreadLocal() {
+    // Reading directly from `data_` is unsafe, because only CAS to the record
+    // in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      release_(record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      release_(kv.second);
+    }
+  }
+
+ private:
+  struct ThreadIdAndValue {
+    std::thread::id thread_id;
+    T value;
+  };
+
+  // Use unordered map guarded by a mutex when lock free storage is full.
+  T& SpilledLocal(std::thread::id this_thread) {
+    std::unique_lock<std::mutex> lock(mu_);
+
+    auto it = per_thread_map_.find(this_thread);
+    if (it == per_thread_map_.end()) {
+      auto result = per_thread_map_.emplace(this_thread, T());
+      eigen_assert(result.second);
+      initialize_((*result.first).second);
+      return (*result.first).second;
+    } else {
+      return it->second;
+    }
+  }
+
+  Initialize initialize_;
+  Release release_;
+  const int capacity_;
+
+  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+  // storage contiguously starting from index 0.
+  MaxSizeVector<ThreadIdAndValue> data_;
+
+  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+  // Number of records stored in the `data_`.
+  std::atomic<int> filled_records_;
+
+  // We fallback on per thread map if lock-free storage is full. In practice
+  // this should never happen, if `capacity_` is a reasonable estimate of the
+  // number of threads running in a system.
+  std::mutex mu_;  // Protects per_thread_map_.
+  std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+}  // namespace Eigen
+
 #endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index a65ee97c9..25030dc0b 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,8 +16,23 @@ namespace Eigen {
 // custom thread pools underneath.
 class ThreadPoolInterface {
  public:
+  // Submits a closure to be run by a thread in the pool.
   virtual void Schedule(std::function<void()> fn) = 0;
 
+  // Submits a closure to be run by threads in the range [start, end) in the
+  // pool.
+  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
+                                int /*end*/) {
+    // Just defer to Schedule in case sub-classes aren't interested in
+    // overriding this functionality.
+    Schedule(fn);
+  }
+
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}
+
   // Returns the number of threads in the pool.
   virtual int NumThreads() const = 0;
 
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index ec27eddb8..149ceaff0 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -13,11 +13,6 @@
 #include <vector>
 #include "EmulateArray.h"
 
-// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
-// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
-// supports enough of the standard for our needs
-#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
-
 #include "CXX11Workarounds.h"
 
 namespace Eigen {
@@ -40,8 +35,9 @@ template<typename T, T... nn>
 struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
 
 template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
+struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 /* numeric list constructors
  *
  * equivalencies:
@@ -100,13 +96,14 @@ template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt..
 template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
 template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
 template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
+#endif //not EIGEN_PARSED_BY_DOXYGEN
 
 template<int n>
 struct h_skip {
   template<typename T, T... ii>
-  constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
   template<typename... tt>
-  constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
 };
 
 template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
@@ -123,6 +120,10 @@ template<typename a, typename... as>                      struct get<0, type_lis
 template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
 template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
 
+template<std::size_t n, typename T, T a, T... as> constexpr T       array_get(const numeric_list<T, a, as...>&) {
+   return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
 /* always get type, regardless of dummy; good for parameter pack expansion */
 
 template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
@@ -264,7 +265,7 @@ template<
   typename Reducer
 > struct reduce<Reducer>
 {
-  constexpr static inline int run() { return Reducer::Identity; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
 };
 
 template<
@@ -272,7 +273,7 @@ template<
   typename A
 > struct reduce<Reducer, A>
 {
-  constexpr static inline A run(A a) { return a; }
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
 };
 
 template<
@@ -281,7 +282,7 @@ template<
   typename... Ts
 > struct reduce<Reducer, A, Ts...>
 {
-  constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
     return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
   }
 };
@@ -289,29 +290,29 @@ template<
 /* generic binary operations */
 
 struct sum_op           {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
   static constexpr int Identity = 0;
 };
 struct product_op       {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
   static constexpr int Identity = 1;
 };
 
-struct logical_and_op   { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
-struct logical_or_op    { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+struct logical_and_op   { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
 
-struct equal_op         { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
-struct not_equal_op     { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
-struct lesser_op        { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
-struct lesser_equal_op  { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
-struct greater_op       { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
-struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+struct equal_op         { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
 
 /* generic unary operations */
 
-struct not_op                { template<typename A> constexpr static inline auto run(A a) -> decltype(!a)      { return !a;      } };
-struct negation_op           { template<typename A> constexpr static inline auto run(A a) -> decltype(-a)      { return -a;      } };
-struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+struct not_op                { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
 
 
 /* reductions for lists */
@@ -320,13 +321,13 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
 // does...
 template<typename... Ts>
-constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
 {
   return reduce<product_op, Ts...>::run(ts...);
 }
 
 template<typename... Ts>
-constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
 {
   return reduce<sum_op, Ts...>::run(ts...);
 }
@@ -334,13 +335,13 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts
 /* reverse arrays */
 
 template<typename Array, int... n>
-constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
 {
   return {{array_get<sizeof...(n) - n - 1>(arr)...}};
 }
 
 template<typename T, std::size_t N>
-constexpr inline array<T, N> array_reverse(array<T, N> arr)
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
 {
   return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
 }
@@ -355,7 +356,7 @@ constexpr inline array<T, N> array_reverse(array<T, N> arr)
 // an infinite loop)
 template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
 struct h_array_reduce {
-  EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
   {
     return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
   }
@@ -364,7 +365,7 @@ struct h_array_reduce {
 template<typename Reducer, typename T, std::size_t N>
 struct h_array_reduce<Reducer, T, N, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
   {
     return array_get<0>(arr);
   }
@@ -373,14 +374,14 @@ struct h_array_reduce<Reducer, T, N, 0>
 template<typename Reducer, typename T>
 struct h_array_reduce<Reducer, T, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
   {
     return identity;
   }
 };
 
 template<typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
 {
   return h_array_reduce<Reducer, T, N>::run(arr, identity);
 }
@@ -388,13 +389,13 @@ EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T i
 /* standard array reductions */
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
 {
   return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
 }
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
 {
   return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
 }
@@ -410,13 +411,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 /* zip an array */
 
 template<typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
 }
 
 template<typename Op, typename A, typename B, std::size_t N>
-constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
 {
   return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -424,13 +425,13 @@ constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, a
 /* zip an array and reduce the result */
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
 }
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -438,13 +439,13 @@ constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decl
 /* apply stuff to an array */
 
 template<typename Op, typename A, std::size_t N, int... n>
-constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
 }
 
 template<typename Op, typename A, std::size_t N>
-constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 {
   return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -452,13 +453,13 @@ constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 /* apply stuff to an array and reduce */
 
 template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
 }
 
 template<typename Reducer, typename Op, typename A, std::size_t N>
-constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -472,7 +473,7 @@ template<int n>
 struct h_repeat
 {
   template<typename t, int... ii>
-  constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
   {
     return {{ typename id_numeric<int, ii, t>::type(v)... }};
   }
@@ -533,10 +534,4 @@ InstType instantiate_by_c_array(ArrType* arr)
 
 } // end namespace Eigen
 
-#else // Non C++11, fallback to emulation mode
-
-#include "EmulateCXX11Meta.h"
-
-#endif
-
 #endif // EIGEN_CXX11META_H
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index fe4d22803..056736c39 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -32,7 +32,7 @@
  * On the other hand, visual studio still doesn't claim to support C++11 although it's
  * compliant enugh for our purpose.
  */
-#if (__cplusplus <= 199711L) && (EIGEN_COMP_MSVC < 1900)
+#if (EIGEN_COMP_CXXVER < 11)
 #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
 #pragma GCC diagnostic error "-Wfatal-errors"
 #endif
@@ -47,9 +47,9 @@ namespace internal {
  */
 
 
-template<std::size_t I, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I]; }
-template<std::size_t I, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I]; }
-template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
+template<std::size_t I_, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
 
 /* Suppose you have a template of the form
  * template<typename T> struct X;
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 30d3ebcff..834b20b55 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,15 +15,20 @@
 // The array class is only available starting with cxx11. Emulate our own here
 // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
 
 namespace Eigen {
 template <typename T, size_t n> class array {
  public:
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
+
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE T& front() { return values[0]; }
@@ -169,6 +174,7 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
     eigen_assert(l.size() == 0);
   }
 #endif
@@ -191,30 +197,26 @@ EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs)
 
 
 namespace internal {
-template<std::size_t I, class T, std::size_t N>
+template<std::size_t I_, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
-  return a[I];
+  return a[I_];
 }
-template<std::size_t I, class T, std::size_t N>
+template<std::size_t I_, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
-  return a[I];
+  return a[I_];
 }
 
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N> > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N>& > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
-  static const size_t value = N;
+  enum { value = N };
 };
-template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N>& > {
-  static const size_t value = N;
+  enum { value = N };
 };
 
 }  // end namespace internal
@@ -222,7 +224,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
 
 #else
 
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
 #include <array>
 namespace Eigen {
 
@@ -238,27 +240,19 @@ namespace internal {
  *                       this may not be constexpr
  */
 #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
-#define STD_GET_ARR_HACK             a._M_instance[I]
+#define STD_GET_ARR_HACK             a._M_instance[I_]
 #elif defined(_LIBCPP_VERSION)
-#define STD_GET_ARR_HACK             a.__elems_[I]
+#define STD_GET_ARR_HACK             a.__elems_[I_]
 #else
-#define STD_GET_ARR_HACK             std::template get<I, T, N>(a)
+#define STD_GET_ARR_HACK             std::template get<I_, T, N>(a)
 #endif
 
-template<std::size_t I, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
-template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
 #undef STD_GET_ARR_HACK
 
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
-  static const size_t value = N;
-};
-template <typename T> struct array_size;
-template<class T, std::size_t N> struct array_size<std::array<T,N> > {
-  static const size_t value = N;
-};
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
deleted file mode 100644
index f3aa1b144..000000000
--- a/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_EMULATE_CXX11_META_H
-#define EIGEN_EMULATE_CXX11_META_H
-
-
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal
-  * \file CXX11/util/EmulateCXX11Meta.h
-  * This file emulates a subset of the functionality provided by CXXMeta.h for
-  * compilers that don't yet support cxx11 such as nvcc.
-  */
-
-struct empty_list { static const std::size_t count = 0; };
-
-template<typename T, typename Tail=empty_list> struct type_list {
-  typedef T HeadType;
-  typedef Tail TailType;
-  static const T head;
-  static const Tail tail;
-  static const std::size_t count = 1 + Tail::count;
-};
-
-struct null_type { };
-
-template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
-         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
-         typename T7 = null_type, typename T8 = null_type>
-struct make_type_list {
-  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
-
-  typedef type_list<T1, tailresult> type;
-};
-
-template<> struct make_type_list<> {
-  typedef empty_list type;
-};
-
-
-template <std::size_t index, class TList> struct get_type;
-
-template <class Head, class Tail>
-struct get_type<0, type_list<Head, Tail> >
-{
-  typedef Head type;
-};
-
-template <std::size_t i, class Head, class Tail>
-struct get_type<i, type_list<Head, Tail> >
-{
-  typedef typename get_type<i-1, Tail>::type type;
-};
-
-
-/* numeric list */
-template <typename T, T n>
-struct type2val {
-  typedef T type;
-  static const T value = n;
-};
-
-
-template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
-  typedef typename make_type_list<type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V> >::type type;
-};
-
-template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
-  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
-                                  type2val<T, V>, type2val<T, V> >::type type;
-};
-
-
-template <std::size_t index, class NList> struct get;
-
-template <std::size_t i>
-struct get<i, empty_list>
-{
-  get() { eigen_assert(false && "index overflow"); }
-  typedef void type;
-  static const char value = '\0';
-};
-
-template <std::size_t i, class Head>
-struct get<i, type_list<Head, empty_list> >
-{
-  get() { eigen_assert(false && "index overflow"); }
-  typedef void type;
-  static const char value = '\0';
-};
-
-template <class Head>
-struct get<0, type_list<Head, empty_list> >
-{
-  typedef typename Head::type type;
-  static const type value = Head::value;
-};
-
-template <class Head, class Tail>
-struct get<0, type_list<Head, Tail> >
-{
-  typedef typename Head::type type;
-  static const type value = Head::value;
-};
-
-template <std::size_t i, class Head, class Tail>
-struct get<i, type_list<Head, Tail> >
-{
-  typedef typename Tail::HeadType::type type;
-  static const type value = get<i-1, Tail>::value;
-};
-
-
-template <class NList> struct arg_prod {
-  static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
-};
-template <> struct arg_prod<empty_list> {
-  static const int value = 1;
-};
-
-
-template<int n, typename t>
-array<t, n> repeat(t v) {
-  array<t, n> array;
-  array.fill(v);
-  return array;
-}
-
-template<std::size_t I, class Head, class Tail>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
-  return get<I, type_list<Head, Tail> >::value;
-}
-template<std::size_t I, class Head, class Tail>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
-  return get<I, type_list<Head, Tail> >::value;
-}
-
-template <class NList>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
-  return arg_prod<NList>::value;
-}
-
-template<typename t, std::size_t n>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
-  t prod = 1;
-  for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
-  return prod;
-}
-template<typename t>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
-  return 0;
-}
-
-template<typename t>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
-  eigen_assert(a.size() > 0);
-  t prod = 1;
-  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
-  return prod;
-}
-
-
-template<std::size_t I, class T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
-  return a[I];
-}
-template<std::size_t I, class T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
-  return a[I];
-}
-
-struct sum_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
-};
-struct product_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
-};
-
-struct logical_and_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
-};
-struct logical_or_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
-};
-
-struct equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
-};
-struct not_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
-};
-struct lesser_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
-};
-struct lesser_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
-};
-
-struct greater_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
-};
-struct greater_equal_op {
-  template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
-};
-
-struct not_op {
-  template<typename A> static inline bool run(A a) { return !a; }
-};
-struct negation_op {
-  template<typename A> static inline bool run(A a) { return -a; }
-};
-struct greater_equal_zero_op {
-  template<typename A> static inline bool run(A a) { return a >= 0; }
-};
-
-
-template<typename Reducer, typename Op, typename A, std::size_t N>
-struct ArrayApplyAndReduce {
-  static inline bool run(const array<A, N>& a) {
-    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
-    for (size_t i = 2; i < N; ++i) {
-      result = Reducer::run(result, Op::run(a[i]));
-    }
-    return result;
-  }
-};
-
-template<typename Reducer, typename Op, typename A>
-struct ArrayApplyAndReduce<Reducer, Op, A, 1>  {
-  static inline bool run(const array<A, 1>& a) {
-    return Op::run(a[0]);
-  }
-};
-
-template<typename Reducer, typename Op, typename A, std::size_t N>
-inline bool array_apply_and_reduce(const array<A, N>& a) {
-  return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
-}
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-struct ArrayZipAndReduce {
-  static inline bool run(const array<A, N>& a, const array<B, N>& b) {
-    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
-    bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
-    for (size_t i = 2; i < N; ++i) {
-      result = Reducer::run(result, Op::run(a[i], b[i]));
-    }
-    return result;
-  }
-};
-
-template<typename Reducer, typename Op, typename A, typename B>
-struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
-  static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
-    return Op::run(a[0], b[0]);
-  }
-};
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
-  return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-
-
-#endif  // EIGEN_EMULATE_CXX11_META_H
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 4bc3dd1ba..277ab149a 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -29,13 +29,13 @@ namespace Eigen {
   */
 template <typename T>
 class MaxSizeVector {
+  static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
  public:
   // Construct a new MaxSizeVector, reserve n elements.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit MaxSizeVector(size_t n)
       : reserve_(n), size_(0),
-        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
-    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
   }
 
   // Construct a new MaxSizeVector, reserve and resize to n.
@@ -43,36 +43,56 @@ class MaxSizeVector {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MaxSizeVector(size_t n, const T& init)
       : reserve_(n), size_(n),
-        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
-    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
+        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+    size_t i = 0;
+    EIGEN_TRY
+    {
+      for(; i < size_; ++i) { new (&data_[i]) T(init); }
+    }
+    EIGEN_CATCH(...)
+    {
+      // Construction failed, destruct in reverse order:
+      for(; (i+1) > 0; --i) { data_[i-1].~T(); }
+      internal::handmade_aligned_free(data_);
+      EIGEN_THROW;
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   ~MaxSizeVector() {
-    for (size_t i = 0; i < size_; ++i) {
-      data_[i].~T();
+    for (size_t i = size_; i > 0; --i) {
+      data_[i-1].~T();
     }
-    internal::aligned_free(data_);
+    internal::handmade_aligned_free(data_);
   }
 
   void resize(size_t n) {
     eigen_assert(n <= reserve_);
-    for (size_t i = size_; i < n; ++i) {
-      new (&data_[i]) T;
+    for (; size_ < n; ++size_) {
+      new (&data_[size_]) T;
     }
-    for (size_t i = n; i < size_; ++i) {
-      data_[i].~T();
+    for (; size_ > n; --size_) {
+      data_[size_-1].~T();
     }
-    size_ = n;
+    eigen_assert(size_ == n);
   }
 
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void push_back(const T& t) {
     eigen_assert(size_ < reserve_);
-    data_[size_++] = t;
+    new (&data_[size_++]) T(t);
   }
 
+  // For C++03 compatibility this only takes one argument
+  template<class X>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void emplace_back(const X& x) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(x);
+  }
+
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const T& operator[] (size_t i) const {
     eigen_assert(i < size_);
@@ -99,11 +119,8 @@ class MaxSizeVector {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void pop_back() {
-    // NOTE: This does not destroy the value at the end the way
-    // std::vector's version of pop_back() does.  That happens when
-    // the Vector is destroyed.
     eigen_assert(size_ > 0);
-    size_--;
+    data_[--size_].~T();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
index 521fa3f76..f8f1c5d0b 100644
--- a/unsupported/Eigen/EulerAngles
+++ b/unsupported/Eigen/EulerAngles
@@ -11,10 +11,10 @@
 #define EIGEN_EULERANGLES_MODULE_H
 
 
-#include "Eigen/Core"
-#include "Eigen/Geometry"
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
 
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
@@ -38,6 +38,6 @@ namespace Eigen {
 #include "src/EulerAngles/EulerSystem.h"
 #include "src/EulerAngles/EulerAngles.h"
 
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT
index 2c45b3999..c8c311a60 100644
--- a/unsupported/Eigen/FFT
+++ b/unsupported/Eigen/FFT
@@ -13,7 +13,7 @@
 #include <complex>
 #include <vector>
 #include <map>
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 
 /**
@@ -68,6 +68,8 @@
   */
  
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #  include <fftw3.h>
@@ -129,8 +131,6 @@ protected:
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
 };
 
 template<typename T_SrcMat,typename T_FftIfc> 
@@ -149,8 +149,6 @@ protected:
   const T_SrcMat & m_src;
   T_FftIfc & m_ifc;
   Index m_nfft;
-private:
-  fft_inv_proxy& operator=(const fft_inv_proxy&);
 };
 
 
@@ -289,6 +287,7 @@ class FFT
     void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
     {
       typedef typename ComplexDerived::Scalar src_type;
+      typedef typename ComplexDerived::RealScalar real_type;
       typedef typename OutputDerived::Scalar dst_type;
       const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
@@ -329,9 +328,9 @@ class FFT
             tmp.head(nhead) = src.head(nhead);
             tmp.tail(ntail) = src.tail(ntail);
             if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
-              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*src_type(.5);
+              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5);
             }else{ // expanding -- split the old Nyquist bin into two halves
-              tmp(nhead) = src(nhead) * src_type(.5);
+              tmp(nhead) = src(nhead) * real_type(.5);
               tmp(tmp.size()-nhead) = tmp(nhead);
             }
           }
@@ -414,5 +413,7 @@ void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
 }
 
 }
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/IterativeSolvers b/unsupported/Eigen/IterativeSolvers
index 31e880bdc..a3f58d676 100644
--- a/unsupported/Eigen/IterativeSolvers
+++ b/unsupported/Eigen/IterativeSolvers
@@ -10,19 +10,28 @@
 #ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
 #define EIGEN_ITERATIVE_SOLVERS_MODULE_H
 
-#include <Eigen/Sparse>
+#include "../../Eigen/Sparse"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
+
 
 /**
-  * \defgroup IterativeSolvers_Module Iterative solvers module
+  * \defgroup IterativeLinearSolvers_Module Iterative solvers module
   * This module aims to provide various iterative linear and non linear solver algorithms.
   * It currently provides:
   *  - a constrained conjugate gradient
   *  - a Householder GMRES implementation
+  *  - an IDR(s) implementation
+  *  - a DGMRES implementation
+  *  - a MINRES implementation
+  *
   * \code
   * #include <unsupported/Eigen/IterativeSolvers>
   * \endcode
   */
-//@{
+
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #ifndef EIGEN_MPL2_ONLY
 #include "src/IterativeSolvers/IterationController.h"
@@ -30,13 +39,13 @@
 #endif
 
 #include "src/IterativeSolvers/IncompleteLU.h"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/Householder"
 #include "src/IterativeSolvers/GMRES.h"
 #include "src/IterativeSolvers/DGMRES.h"
 //#include "src/IterativeSolvers/SSORPreconditioner.h"
 #include "src/IterativeSolvers/MINRES.h"
+#include "src/IterativeSolvers/IDRS.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-//@}
 
 #endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/unsupported/Eigen/LevenbergMarquardt b/unsupported/Eigen/LevenbergMarquardt
index 0fe2680ba..109050501 100644
--- a/unsupported/Eigen/LevenbergMarquardt
+++ b/unsupported/Eigen/LevenbergMarquardt
@@ -12,12 +12,12 @@
 
 // #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff> 
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
-#include <Eigen/SparseQR>
+#include "../../Eigen/SparseQR"
 
 /**
   * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
@@ -29,7 +29,10 @@
   * 
   */
 
-#include "Eigen/SparseCore"
+#include "../../Eigen/SparseCore"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 #include "src/LevenbergMarquardt/LMqrsolv.h"
@@ -41,5 +44,6 @@
 #include "src/LevenbergMarquardt/LevenbergMarquardt.h"
 #include "src/LevenbergMarquardt/LMonestep.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LEVENBERGMARQUARDT_MODULE
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 7f0b70c63..c4ea4ec5f 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -12,7 +12,7 @@
 #ifndef EIGEN_MPREALSUPPORT_MODULE_H
 #define EIGEN_MPREALSUPPORT_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 #include <mpreal.h>
 
 namespace Eigen {
@@ -90,6 +90,9 @@ int main()
 #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
     static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
     static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
+    
+    static inline int digits ()               { return std::numeric_limits<Real>::digits(); }
+    static inline int digits (const Real& x)  { return std::numeric_limits<Real>::digits(x); }
 #endif
 
     static inline Real dummy_precision()
@@ -159,6 +162,7 @@ int main()
       typedef ResScalar LhsPacket;
       typedef ResScalar RhsPacket;
       typedef ResScalar ResPacket;
+      typedef LhsPacket LhsPacket4Packing;
       
     };
 
diff --git a/unsupported/Eigen/MatrixFunctions b/unsupported/Eigen/MatrixFunctions
index 0320606c1..20c23d1c5 100644
--- a/unsupported/Eigen/MatrixFunctions
+++ b/unsupported/Eigen/MatrixFunctions
@@ -14,9 +14,9 @@
 #include <cfloat>
 #include <list>
 
-#include <Eigen/Core>
-#include <Eigen/LU>
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/Core"
+#include "../../Eigen/LU"
+#include "../../Eigen/Eigenvalues"
 
 /**
   * \defgroup MatrixFunctions_Module Matrix functions module
@@ -53,12 +53,16 @@
   *
   */
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 
 /** 
 \page matrixbaseextra_page
@@ -161,8 +165,8 @@ the z-axis.
 \include MatrixExponential.cpp
 Output: \verbinclude MatrixExponential.out
 
-\note \p M has to be a matrix of \c float, \c double, \c long double
-\c complex<float>, \c complex<double>, or \c complex<long double> .
+\note \p M has to be a matrix of \c float, \c double, `long double`
+\c complex<float>, \c complex<double>, or `complex<long double>` .
 
 
 \subsection matrixbase_log MatrixBase::log()
@@ -219,9 +223,8 @@ documentation of \ref matrixbase_exp "exp()".
 \include MatrixLogarithm.cpp
 Output: \verbinclude MatrixLogarithm.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or `complex<long double>`.
 
 \sa MatrixBase::exp(), MatrixBase::matrixFunction(), 
     class MatrixLogarithmAtomic, MatrixBase::sqrt().
@@ -326,9 +329,9 @@ Example:
 \include MatrixPower_optimal.cpp
 Output: \verbinclude MatrixPower_optimal.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or
+\c complex<long double> .
 
 \sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower.
 
diff --git a/unsupported/Eigen/MoreVectorization b/unsupported/Eigen/MoreVectorization
index 470e72430..7662b4780 100644
--- a/unsupported/Eigen/MoreVectorization
+++ b/unsupported/Eigen/MoreVectorization
@@ -9,7 +9,7 @@
 #ifndef EIGEN_MOREVECTORIZATION_MODULE_H
 #define EIGEN_MOREVECTORIZATION_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
diff --git a/unsupported/Eigen/NonLinearOptimization b/unsupported/Eigen/NonLinearOptimization
index 600ab4c12..961f192b5 100644
--- a/unsupported/Eigen/NonLinearOptimization
+++ b/unsupported/Eigen/NonLinearOptimization
@@ -12,10 +12,10 @@
 
 #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff>
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
 /**
   * \defgroup NonLinearOptimization_Module Non linear optimization module
@@ -30,12 +30,12 @@
   * actually linear. But if this is so, you should probably better use other
   * methods more fitted to this special case.
   *
-  * One algorithm allows to find an extremum of such a system (Levenberg
-  * Marquardt algorithm) and the second one is used to find 
+  * One algorithm allows to find a least-squares solution of such a system
+  * (Levenberg-Marquardt algorithm) and the second one is used to find 
   * a zero for the system (Powell hybrid "dogleg" method).
   *
   * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
-  * Minpack is a very famous, old, robust and well-reknown package, written in 
+  * Minpack is a very famous, old, robust and well renowned package, written in
   * fortran. Those implementations have been carefully tuned, tested, and used
   * for several decades.
   *
@@ -58,35 +58,41 @@
   * There are two kinds of tests : those that come from examples bundled with cminpack.
   * They guaranty we get the same results as the original algorithms (value for 'x',
   * for the number of evaluations of the function, and for the number of evaluations
-  * of the jacobian if ever).
+  * of the Jacobian if ever).
   * 
   * Other tests were added by myself at the very beginning of the 
-  * process and check the results for levenberg-marquardt using the reference data 
+  * process and check the results for Levenberg-Marquardt using the reference data 
   * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
-  * carefully checked that the same results were obtained when modifiying the 
+  * carefully checked that the same results were obtained when modifying the
   * code. Please note that we do not always get the exact same decimals as they do,
   * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
   * which is 64 bits on most platforms (x86 and amd64, at least).
-  * I've performed those tests on several other implementations of levenberg-marquardt, and
+  * I've performed those tests on several other implementations of Levenberg-Marquardt, and
   * (c)minpack performs VERY well compared to those, both in accuracy and speed.
   * 
   * The documentation for running the tests is on the wiki
   * http://eigen.tuxfamily.org/index.php?title=Tests
   * 
-  * \section API API : overview of methods
+  * \section API API: overview of methods
   * 
-  * Both algorithms can use either the jacobian (provided by the user) or compute 
-  * an approximation by themselves (actually using Eigen \ref NumericalDiff_Module).
-  * The part of API referring to the latter use 'NumericalDiff' in the method names
-  * (exemple: LevenbergMarquardt.minimizeNumericalDiff() ) 
+  * Both algorithms needs a functor computing the Jacobian. It can be computed by
+  * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+  * differences (see \ref NumericalDiff_Module). For instance:
+  *\code
+  * MyFunc func;
+  * NumericalDiff<MyFunc> func_with_num_diff(func);
+  * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+  * \endcode
+  * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+  * you.
   * 
   * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
   * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
   * minpack package that you probably should NOT use until you are porting a code that
-  *  was previously using minpack. They just define a 'simple' API with default values 
+  * was previously using minpack. They just define a 'simple' API with default values 
   * for some parameters.
   * 
-  * All algorithms are provided using Two APIs :
+  * All algorithms are provided using two APIs :
   *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
   * this way the caller have control over the steps
   *     - one where the user just calls a method (optimize() or solve()) which will 
@@ -94,7 +100,7 @@
   *  convenience.
   * 
   * As an example, the method LevenbergMarquardt::minimize() is 
-  * implemented as follow : 
+  * implemented as follow: 
   * \code
   * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
   * {
diff --git a/unsupported/Eigen/NumericalDiff b/unsupported/Eigen/NumericalDiff
index 433334ca8..0668f960f 100644
--- a/unsupported/Eigen/NumericalDiff
+++ b/unsupported/Eigen/NumericalDiff
@@ -10,7 +10,7 @@
 #ifndef EIGEN_NUMERICALDIFF_MODULE
 #define EIGEN_NUMERICALDIFF_MODULE
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 87f50947d..f8c213003 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport
@@ -10,7 +10,7 @@
 #ifndef EIGEN_OPENGL_MODULE
 #define EIGEN_OPENGL_MODULE
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
 
 #if defined(__APPLE_CC__)
   #include <OpenGL/gl.h>
@@ -25,7 +25,7 @@ namespace Eigen {
   *
   * This module provides wrapper functions for a couple of OpenGL functions
   * which simplify the way to pass Eigen's object to openGL.
-  * Here is an exmaple:
+  * Here is an example:
   * 
   * \code
   * // You need to add path_to_eigen/unsupported to your include path.
@@ -184,7 +184,7 @@ inline void glRotate(const Rotation2D<float>& rot)
 }
 inline void glRotate(const Rotation2D<double>& rot)
 {
-  glRotated(rot.angle()*180.0/EIGEN_PI, 0.0, 0.0, 1.0);
+  glRotated(rot.angle()*180.0/double(EIGEN_PI), 0.0, 0.0, 1.0);
 }
 
 template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
diff --git a/unsupported/Eigen/Polynomials b/unsupported/Eigen/Polynomials
index cece56337..32ce2a2aa 100644
--- a/unsupported/Eigen/Polynomials
+++ b/unsupported/Eigen/Polynomials
@@ -9,11 +9,11 @@
 #ifndef EIGEN_POLYNOMIALS_MODULE_H
 #define EIGEN_POLYNOMIALS_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/Eigenvalues"
 
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 // Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
 #if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
@@ -132,7 +132,6 @@
   Output: \verbinclude PolynomialSolver1.out
 */
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_POLYNOMIALS_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/Skyline b/unsupported/Eigen/Skyline
index 71a68cb42..ebdf143f7 100644
--- a/unsupported/Eigen/Skyline
+++ b/unsupported/Eigen/Skyline
@@ -10,9 +10,9 @@
 #define EIGEN_SKYLINE_MODULE_H
 
 
-#include "Eigen/Core"
+#include "../../Eigen/Core"
 
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #include <map>
 #include <cstdlib>
@@ -34,6 +34,6 @@
 #include "src/Skyline/SkylineInplaceLU.h"
 #include "src/Skyline/SkylineProduct.h"
 
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SKYLINE_MODULE_H
diff --git a/unsupported/Eigen/SparseExtra b/unsupported/Eigen/SparseExtra
index 819cffa27..ba5cbd661 100644
--- a/unsupported/Eigen/SparseExtra
+++ b/unsupported/Eigen/SparseExtra
@@ -24,6 +24,7 @@
 
 #ifdef EIGEN_GOOGLEHASH_SUPPORT
   #include <google/dense_hash_map>
+  #include <google/sparse_hash_map>
 #endif
 
 /**
diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions
index a2ad4925e..f6a2460e6 100644
--- a/unsupported/Eigen/SpecialFunctions
+++ b/unsupported/Eigen/SpecialFunctions
@@ -29,12 +29,29 @@ namespace Eigen {
   * - erfc
   * - lgamma
   * - igamma
+  * - igamma_der_a
+  * - gamma_sample_der_alpha
   * - igammac
   * - digamma
+  * - ndtri
   * - polygamma
   * - zeta
   * - betainc
   *
+  * Bessel Functions
+  * - bessel_i0
+  * - bessel_i0e
+  * - bessel_i1
+  * - bessel_i1e
+  * - bessel_j0
+  * - bessel_j1
+  * - bessel_k0
+  * - bessel_k0e
+  * - bessel_k1
+  * - bessel_k1e
+  * - bessel_y0
+  * - bessel_y1
+  *
   * \code
   * #include <unsupported/Eigen/SpecialFunctions>
   * \endcode
@@ -43,14 +60,37 @@ namespace Eigen {
 
 }
 
+#include "src/SpecialFunctions/BesselFunctionsImpl.h"
+#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
+#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
+#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
+#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
 #include "src/SpecialFunctions/SpecialFunctionsImpl.h"
-#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#if defined(EIGEN_HIPCC)
+#include "src/SpecialFunctions/HipVectorCompatibility.h"
+#endif
+#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
 #include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
 #include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
 #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
 
-#if defined EIGEN_VECTORIZE_CUDA
-  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+  #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+  #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+  #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+  #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
 #endif
 
 namespace Eigen {
diff --git a/unsupported/Eigen/Splines b/unsupported/Eigen/Splines
index 322e6b9f5..2ca581364 100644
--- a/unsupported/Eigen/Splines
+++ b/unsupported/Eigen/Splines
@@ -24,8 +24,12 @@ namespace Eigen
   */
 }
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
 #include "src/Splines/SplineFwd.h"
 #include "src/Splines/Spline.h"
 #include "src/Splines/SplineFitting.h"
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif // EIGEN_SPLINES_MODULE_H
diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 279fe5cd3..0f166e35f 100755
--- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b)
   make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
 }
 
-template<typename _DerType, bool Enable> struct auto_diff_special_op;
+template<typename DerivativeType, bool Enable> struct auto_diff_special_op;
 
 } // end namespace internal
 
-template<typename _DerType> class AutoDiffScalar;
+template<typename DerivativeType> class AutoDiffScalar;
 
 template<typename NewDerType>
 inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
@@ -38,16 +38,16 @@ inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::
 }
 
 /** \class AutoDiffScalar
-  * \brief A scalar type replacement with automatic differentation capability
+  * \brief A scalar type replacement with automatic differentiation capability
   *
-  * \param _DerType the vector type used to store/represent the derivatives. The base scalar type
+  * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type
   *                 as well as the number of derivatives to compute are determined from this type.
   *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
   *                 if the number of derivatives is not known at compile time, and/or, the number
   *                 of derivatives is large.
-  *                 Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a
+  *                 Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a
   *                 existing vector into an AutoDiffScalar.
-  *                 Finally, _DerType can also be any Eigen compatible expression.
+  *                 Finally, DerivativeType can also be any Eigen compatible expression.
   *
   * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
   * template mechanism.
@@ -63,17 +63,17 @@ inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::
   *
   */
 
-template<typename _DerType>
+template<typename DerivativeType>
 class AutoDiffScalar
   : public internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                                          typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                                          typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value>
 {
   public:
     typedef internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                       typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> Base;
-    typedef typename internal::remove_all<_DerType>::type DerType;
+            <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+                       typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value> Base;
+    typedef typename internal::remove_all<DerivativeType>::type DerType;
     typedef typename internal::traits<DerType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real Real;
 
@@ -382,16 +382,16 @@ class AutoDiffScalar
 
 namespace internal {
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, true>
-//   : auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, true>
+//   : auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
 //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
 {
-  typedef typename remove_all<_DerType>::type DerType;
+  typedef typename remove_all<DerivativeType>::type DerType;
   typedef typename traits<DerType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real Real;
 
-//   typedef auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+//   typedef auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
 //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
 
 //   using Base::operator+;
@@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true>
 //   using Base::operator*;
 //   using Base::operator*=;
 
-  const AutoDiffScalar<_DerType>& derived() const { return *static_cast<const AutoDiffScalar<_DerType>*>(this); }
-  AutoDiffScalar<_DerType>& derived() { return *static_cast<AutoDiffScalar<_DerType>*>(this); }
+  const AutoDiffScalar<DerivativeType>& derived() const { return *static_cast<const AutoDiffScalar<DerivativeType>*>(this); }
+  AutoDiffScalar<DerivativeType>& derived() { return *static_cast<AutoDiffScalar<DerivativeType>*>(this); }
 
 
   inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
@@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true>
     return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
   }
 
-  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<_DerType>& b)
+  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<DerivativeType>& b)
   {
     return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
   }
 
-  inline AutoDiffScalar<_DerType>& operator+=(const Real& other)
+  inline AutoDiffScalar<DerivativeType>& operator+=(const Real& other)
   {
     derived().value() += other;
     return derived();
@@ -431,28 +431,46 @@ struct auto_diff_special_op<_DerType, true>
   }
 
   friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
-  operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
+  operator*(const Real& other, const AutoDiffScalar<DerivativeType>& a)
   {
     return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
       a.value() * other,
       a.derivatives() * other);
   }
 
-  inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other)
+  inline AutoDiffScalar<DerivativeType>& operator*=(const Scalar& other)
   {
     *this = *this * other;
     return derived();
   }
 };
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, false>
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, false>
 {
   void operator*() const;
   void operator-() const;
   void operator+() const;
 };
 
+template<typename BinOp, typename A, typename B, typename RefType>
+void make_coherent_expression(CwiseBinaryOp<BinOp,A,B> xpr, const RefType &ref)
+{
+  make_coherent(xpr.const_cast_derived().lhs(), ref);
+  make_coherent(xpr.const_cast_derived().rhs(), ref);
+}
+
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseUnaryOp<UnaryOp,A> &xpr, const RefType &ref)
+{
+  make_coherent(xpr.nestedExpression().const_cast_derived(), ref);
+}
+
+// needed for compilation only
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseNullaryOp<UnaryOp,A> &, const RefType &)
+{}
+
 template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols, typename B>
 struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>, B> {
   typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
@@ -462,6 +480,10 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows,
       a.resize(b.size());
       a.setZero();
     }
+    else if (B::SizeAtCompileTime==Dynamic && a.size()!=0 && b.size()==0)
+    {
+      make_coherent_expression(b,a);
+    }
   }
 };
 
@@ -474,13 +496,17 @@ struct make_coherent_impl<A, Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRo
       b.resize(a.size());
       b.setZero();
     }
+    else if (A::SizeAtCompileTime==Dynamic && b.size()!=0 && a.size()==0)
+    {
+      make_coherent_expression(a,b);
+    }
   }
 };
 
 template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols,
          typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
 struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,
-                             Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+                          Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
   typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
   typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
   static void run(A& a, B& b) {
@@ -534,42 +560,48 @@ struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, Bi
   EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
   FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
     using namespace Eigen; \
-    EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+    EIGEN_UNUSED_VARIABLE(sizeof(Scalar)); \
     CODE; \
   }
 
 template<typename DerType>
+struct CleanedUpDerType {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> type;
+};
+
+template<typename DerType>
 inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
 template<typename DerType>
 inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  { return x; }
 template<typename DerType>
 inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x <= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x >= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x < y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
+inline typename CleanedUpDerType<DerType>::type (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
   return (x > y ? ADS(x) : ADS(y));
 }
 template<typename DerType>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() < y.value() ? x : y);
 }
 template<typename DerType>
-inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() >= y.value() ? x : y);
 }
 
@@ -684,10 +716,15 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
 }
 
 namespace std {
+
 template <typename T>
 class numeric_limits<Eigen::AutoDiffScalar<T> >
   : public numeric_limits<typename T::Scalar> {};
 
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T&> >
+  : public numeric_limits<typename T::Scalar> {};
+
 }  // namespace std
 
 #endif // EIGEN_AUTODIFF_SCALAR_H
diff --git a/unsupported/Eigen/src/BVH/KdBVH.h b/unsupported/Eigen/src/BVH/KdBVH.h
index 1b8d75865..2d5b76ad0 100644
--- a/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/unsupported/Eigen/src/BVH/KdBVH.h
@@ -35,6 +35,7 @@ struct get_boxes_helper {
   {
     outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
     eigen_assert(outBoxes.size() == objects.size());
+    EIGEN_ONLY_USED_FOR_DEBUG(objects);
   }
 };
 
@@ -170,7 +171,7 @@ private:
   typedef internal::vector_int_pair<Scalar, Dim> VIPair;
   typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
   typedef Matrix<Scalar, Dim, 1> VectorType;
-  struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
   {
     VectorComparator(int inDim) : dim(inDim) {}
     inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 866a8a460..0fbd84772 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -3,29 +3,14 @@
 //
 // Copyright (C) 2012 David Harmon <dharmon@gmail.com>
 //
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 #define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 
-#include <Eigen/Dense>
+#include "../../../../Eigen/Dense"
 
 namespace Eigen { 
 
@@ -300,7 +285,7 @@ public:
 
   /** \brief Reports whether previous computation was successful.
    *
-   * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
    */
   ComputationInfo info() const
   {
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
index 40af550e8..22088eb30 100644
--- a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -1,6 +1,6 @@
-FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
 
-INSTALL(FILES
+install(FILES
   ${Eigen_EulerAngles_SRCS}
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
   )
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
index 13a0da1ab..e43cdb7fb 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerAngles.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -12,11 +12,6 @@
 
 namespace Eigen
 {
-  /*template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-  struct ei_eulerangles_assign_impl;*/
-
   /** \class EulerAngles
     *
     * \ingroup EulerAngles_Module
@@ -36,7 +31,7 @@ namespace Eigen
     * ### Rotation representation and conversions ###
     *
     * It has been proved(see Wikipedia link below) that every rotation can be represented
-    *  by Euler angles, but there is no singular representation (e.g. unlike rotation matrices).
+    *  by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
     * Therefore, you can convert from Eigen rotation and to them
     *  (including rotation matrices, which is not called "rotations" by Eigen design).
     *
@@ -55,33 +50,27 @@ namespace Eigen
     * Additionally, some axes related computation is done in compile time.
     *
     * #### Euler angles ranges in conversions ####
+    * Rotations representation as EulerAngles are not single (unlike matrices),
+    *  and even have infinite EulerAngles representations.<BR>
+    * For example, add or subtract 2*PI from either angle of EulerAngles
+    *  and you'll get the same rotation.
+    * This is the general reason for infinite representation,
+    *  but it's not the only general reason for not having a single representation.
     *
-    * When converting some rotation to Euler angles, there are some ways you can guarantee
-    *  the Euler angles ranges.
+    * When converting rotation to EulerAngles, this class convert it to specific ranges
+    * When converting some rotation to EulerAngles, the rules for ranges are as follow:
+    * - If the rotation we converting from is an EulerAngles
+    *  (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
+    * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+    *   As for Beta angle:
+    *    - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+    *    - otherwise:
+    *      - If the beta axis is positive, the beta angle will be in the range [0, PI]
+    *      - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
     *
-    * #### implicit ranges ####
-    * When using implicit ranges, all angles are guarantee to be in the range [-PI, +PI],
-    *  unless you convert from some other Euler angles.
-    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
     * \sa EulerAngles(const MatrixBase<Derived>&)
     * \sa EulerAngles(const RotationBase<Derived, 3>&)
     *
-    * #### explicit ranges ####
-    * When using explicit ranges, all angles are guarantee to be in the range you choose.
-    * In the range Boolean parameter, you're been ask whether you prefer the positive range or not:
-    * - _true_ - force the range between [0, +2*PI]
-    * - _false_ - force the range between [-PI, +PI]
-    *
-    * ##### compile time ranges #####
-    * This is when you have compile time ranges and you prefer to
-    *  use template parameter. (e.g. for performance)
-    * \sa FromRotation()
-    *
-    * ##### run-time time ranges #####
-    * Run-time ranges are also supported.
-    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
-    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
-    *
     * ### Convenient user typedefs ###
     *
     * Convenient typedefs for EulerAngles exist for float and double scalar,
@@ -103,7 +92,7 @@ namespace Eigen
     *
     * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
     *
-    * \tparam _Scalar the scalar type, i.e., the type of the angles.
+    * \tparam _Scalar the scalar type, i.e. the type of the angles.
     *
     * \tparam _System the EulerSystem to use, which represents the axes of rotation.
     */
@@ -111,8 +100,11 @@ namespace Eigen
   class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
   {
     public:
+      typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
+      
       /** the scalar type of the angles */
       typedef _Scalar Scalar;
+      typedef typename NumTraits<Scalar>::Real RealScalar;
       
       /** the EulerSystem to use, which represents the axes of rotation. */
       typedef _System System;
@@ -146,67 +138,56 @@ namespace Eigen
     public:
       /** Default constructor without initialization. */
       EulerAngles() {}
-      /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). */
+      /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
       EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
         m_angles(alpha, beta, gamma) {}
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m.
-        *
-        * \note All angles will be in the range [-PI, PI].
-      */
-      template<typename Derived>
-      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      // TODO: Test this constructor
+      /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
+      explicit EulerAngles(const Scalar* data) : m_angles(data) {}
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
-        *  with options to choose for each angle the requested range.
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
+      /** Constructs and initializes an EulerAngles from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
         *
-        * \param m The 3x3 rotation matrix to convert
-        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-      */
+        * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
+        *  Alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+       */
       template<typename Derived>
-      EulerAngles(
-        const MatrixBase<Derived>& m,
-        bool positiveRangeAlpha,
-        bool positiveRangeBeta,
-        bool positiveRangeGamma) {
-        
-        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
-      }
+      explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
       
       /** Constructs and initialize Euler angles from a rotation \p rot.
         *
-        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
-        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
-        *  (Use other functions here for enforcing range if this effect is desired)
+        * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
+        *  angles ranges are __undefined__.
+        *  Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+        *  As for Beta angle:
+        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+        *   - otherwise:
+        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
       */
       template<typename Derived>
-      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+      EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
       
-      /** Constructs and initialize Euler angles from a rotation \p rot,
-        *  with options to choose for each angle the requested range.
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param rot The 3x3 rotation matrix to convert
-        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-      */
-      template<typename Derived>
-      EulerAngles(
-        const RotationBase<Derived, 3>& rot,
-        bool positiveRangeAlpha,
-        bool positiveRangeBeta,
-        bool positiveRangeGamma) {
-        
-        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
-      }
+      /*EulerAngles(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
 
       /** \returns The angle values stored in a vector (alpha, beta, gamma). */
       const Vector3& angles() const { return m_angles; }
@@ -246,90 +227,48 @@ namespace Eigen
         return inverse();
       }
       
-      /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
-        *  with options to choose for each angle the requested range (__only in compile time__).
+      /** Set \c *this from either:
+        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+        *  - a 3D vector expression representing Euler angles.
         *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param m The 3x3 rotation matrix to convert
-        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        */
-      template<
-        bool PositiveRangeAlpha,
-        bool PositiveRangeBeta,
-        bool PositiveRangeGamma,
-        typename Derived>
-      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
-      {
-        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
-        
-        EulerAngles e;
-        System::template CalcEulerAngles<
-          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
-        return e;
-      }
-      
-      /** Constructs and initialize Euler angles from a rotation \p rot,
-        *  with options to choose for each angle the requested range (__only in compile time__).
-        *
-        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
-        * Otherwise, the specified angle will be in the range [-PI, +PI].
-        *
-        * \param rot The 3x3 rotation matrix to convert
-        * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
-        * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
+        *  angles ranges output.
       */
-      template<
-        bool PositiveRangeAlpha,
-        bool PositiveRangeBeta,
-        bool PositiveRangeGamma,
-        typename Derived>
-      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
-      {
-        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
-      }
-      
-      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      template<class Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& other)
       {
-        // TODO: Implement it in a faster way for quaternions
-        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
-        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
-        // Currently we compute all matrix cells from quaternion.
-
-        // Special case only for ZYX
-        //Scalar y2 = q.y() * q.y();
-        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
-        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
-        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
-      }*/
-      
-      /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */
-      template<typename Derived>
-      EulerAngles& operator=(const MatrixBase<Derived>& m) {
-        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
+         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
         
-        System::CalcEulerAngles(*this, m);
+        internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
         return *this;
       }
 
       // TODO: Assign and construct from another EulerAngles (with different system)
       
-      /** Set \c *this from a rotation. */
+      /** Set \c *this from a rotation.
+        *
+        * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
+        *  angles ranges output.
+      */
       template<typename Derived>
       EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
         System::CalcEulerAngles(*this, rot.toRotationMatrix());
         return *this;
       }
       
-      // TODO: Support isApprox function
+      /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+        * determined by \a prec.
+        *
+        * \sa MatrixBase::isApprox() */
+      bool isApprox(const EulerAngles& other,
+        const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+      { return angles().isApprox(other.angles(), prec); }
 
       /** \returns an equivalent 3x3 rotation matrix. */
       Matrix3 toRotationMatrix() const
       {
+        // TODO: Calc it faster
         return static_cast<QuaternionType>(*this).toRotationMatrix();
       }
 
@@ -347,6 +286,15 @@ namespace Eigen
         s << eulerAngles.angles().transpose();
         return s;
       }
+      
+      /** \returns \c *this with scalar type casted to \a NewScalarType */
+      template <typename NewScalarType>
+      EulerAngles<NewScalarType, System> cast() const
+      {
+        EulerAngles<NewScalarType, System> e;
+        e.angles() = angles().template cast<NewScalarType>();
+        return e;
+      }
   };
 
 #define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
@@ -379,8 +327,29 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
     {
       typedef _Scalar Scalar;
     };
+    
+    // set from a rotation matrix
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,3>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& m)
+      {
+        System::CalcEulerAngles(e, m);
+      }
+    };
+    
+    // set from a vector of Euler angles
+    template<class System, class Other>
+    struct eulerangles_assign_impl<System,Other,3,1>
+    {
+      typedef typename Other::Scalar Scalar;
+      static void run(EulerAngles<Scalar, System>& e, const Other& vec)
+      {
+        e.angles() = vec;
+      }
+    };
   }
-  
 }
 
 #endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 98f9f647d..2a833b0a4 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -12,13 +12,13 @@
 
 namespace Eigen
 {
-  // Forward declerations
+  // Forward declarations
   template <typename _Scalar, class _System>
   class EulerAngles;
   
   namespace internal
   {
-    // TODO: Check if already exists on the rest API
+    // TODO: Add this trait to the Eigen internal API?
     template <int Num, bool IsPositive = (Num > 0)>
     struct Abs
     {
@@ -36,6 +36,12 @@ namespace Eigen
     {
       enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
     };
+    
+    template<typename System,
+            typename Other,
+            int OtherRows=Other::RowsAtCompileTime,
+            int OtherCols=Other::ColsAtCompileTime>
+    struct eulerangles_assign_impl;
   }
   
   #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
@@ -69,7 +75,7 @@ namespace Eigen
     *
     * You can use this class to get two things:
     *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
-    *  - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
     *
     * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
     * This meta-class store constantly those signed axes. (see \ref EulerAxis)
@@ -80,7 +86,7 @@ namespace Eigen
     *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
     *  - all axes X, Y, Z in each valid order (see below what order is valid)
     *  - rotation over the axis is supported both over the positive and negative directions.
-    *  - both tait bryan and proper/classic Euler angles (i.e. the opposite).
+    *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
     *
     * Since EulerSystem support both positive and negative directions,
     *  you may call this rotation distinction in other names:
@@ -90,7 +96,7 @@ namespace Eigen
     * Notice all axed combination are valid, and would trigger a static assertion.
     * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
     * This yield two and only two classes:
-    *  - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
     *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
     *     and the second is different, e.g. {X,Y,X}
     *
@@ -112,9 +118,9 @@ namespace Eigen
     *
     * \tparam _AlphaAxis the first fixed EulerAxis
     *
-    * \tparam _AlphaAxis the second fixed EulerAxis
+    * \tparam _BetaAxis the second fixed EulerAxis
     *
-    * \tparam _AlphaAxis the third fixed EulerAxis
+    * \tparam _GammaAxis the third fixed EulerAxis
     */
   template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
   class EulerSystem
@@ -138,14 +144,16 @@ namespace Eigen
       BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
       GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
       
-      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */
-      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */
-      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */
-      
-      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */
-      IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+
+      // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
+      // by Z, or Z is followed by X; otherwise it is odd.
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
 
-      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
     };
     
     private:
@@ -165,86 +173,84 @@ namespace Eigen
     EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
       BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
 
-    enum
-    {
+    static const int
       // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
       // They are used in this class converters.
       // They are always different from each other, and their possible values are: 0, 1, or 2.
-      I = AlphaAxisAbs - 1,
-      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
-      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
-    };
+      I_ = AlphaAxisAbs - 1,
+      J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    ;
     
     // TODO: Get @mat parameter in form that avoids double evaluation.
     template <typename Derived>
     static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sin;
-      using std::cos;
+      using std::sqrt;
       
       typedef typename Derived::Scalar Scalar;
-      typedef Matrix<Scalar,2,1> Vector2;
-      
-      res[0] = atan2(mat(J,K), mat(K,K));
-      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
-      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
-        if(res[0] > Scalar(0)) {
-          res[0] -= Scalar(EIGEN_PI);
-        }
-        else {
-          res[0] += Scalar(EIGEN_PI);
-        }
-        res[1] = atan2(-mat(I,K), -c2);
+
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2);
+      res[1] = atan2(plusMinus * mat(I_,K_), Rsum);
+
+      // There is a singularity when cos(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
+        res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
+        res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
+      }
+      else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1
+        Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma
+        Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma)
+        Scalar alphaPlusMinusGamma = atan2(spos, cpos);
+        res[0] = alphaPlusMinusGamma;
+        res[2] = 0;
+      }
+      else {// cos(beta) == 0 and sin(beta) == -1
+        Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma)
+        Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_);               // 2*cos(alpha + minusPlus*gamma)
+        Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
+        res[0] = alphaMinusPlusBeta;
+        res[2] = 0;
       }
-      else
-        res[1] = atan2(-mat(I,K), c2);
-      Scalar s1 = sin(res[0]);
-      Scalar c1 = cos(res[0]);
-      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
     }
 
     template <typename Derived>
-    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
+                                    const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sin;
-      using std::cos;
+      using std::sqrt;
 
       typedef typename Derived::Scalar Scalar;
-      typedef Matrix<Scalar,2,1> Vector2;
-      
-      res[0] = atan2(mat(J,I), mat(K,I));
-      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
-      {
-        if(res[0] > Scalar(0)) {
-          res[0] -= Scalar(EIGEN_PI);
-        }
-        else {
-          res[0] += Scalar(EIGEN_PI);
-        }
-        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
-        res[1] = -atan2(s2, mat(I,I));
-      }
-      else
-      {
-        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
-        res[1] = atan2(s2, mat(I,I));
-      }
 
-      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
-      // we can compute their respective rotation, and apply its inverse to M. Since the result must
-      // be a rotation around x, we have:
-      //
-      //  c2  s1.s2 c1.s2                   1  0   0 
-      //  0   c1    -s1       *    M    =   0  c3  s3
-      //  -s2 s1.c2 c1.c2                   0 -s3  c3
-      //
-      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+      const Scalar plusMinus = IsEven? 1 : -1;
+      const Scalar minusPlus = IsOdd?  1 : -1;
+
+      const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2);
 
-      Scalar s1 = sin(res[0]);
-      Scalar c1 = cos(res[0]);
-      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+      res[1] = atan2(Rsum, mat(I_, I_));
+
+      // There is a singularity when sin(beta) == 0
+      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
+        res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
+        res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+      }
+      else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1
+        Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma)
+        Scalar cpos = mat(J_, J_) + mat(K_, K_);                         // 2*cos(alpha + gamma)
+        res[0] = atan2(spos, cpos);
+        res[2] = 0;
+      }
+      else {// sin(beta) == 0 and cos(beta) == -1
+        Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma)
+        Scalar cneg = mat(J_, J_) - mat(K_, K_);                         // 2*cos(alpha - gamma)
+        res[0] = atan2(sneg, cneg);
+        res[2] = 0;
+      }
     }
     
     template<typename Scalar>
@@ -252,55 +258,28 @@ namespace Eigen
       EulerAngles<Scalar, EulerSystem>& res,
       const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
     {
-      CalcEulerAngles(res, mat, false, false, false);
-    }
-    
-    template<
-      bool PositiveRangeAlpha,
-      bool PositiveRangeBeta,
-      bool PositiveRangeGamma,
-      typename Scalar>
-    static void CalcEulerAngles(
-      EulerAngles<Scalar, EulerSystem>& res,
-      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
-    {
-      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
-    }
-    
-    template<typename Scalar>
-    static void CalcEulerAngles(
-      EulerAngles<Scalar, EulerSystem>& res,
-      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
-      bool PositiveRangeAlpha,
-      bool PositiveRangeBeta,
-      bool PositiveRangeGamma)
-    {
       CalcEulerAngles_imp(
         res.angles(), mat,
         typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
 
-      if (IsAlphaOpposite == IsOdd)
+      if (IsAlphaOpposite)
         res.alpha() = -res.alpha();
         
-      if (IsBetaOpposite == IsOdd)
+      if (IsBetaOpposite)
         res.beta() = -res.beta();
         
-      if (IsGammaOpposite == IsOdd)
+      if (IsGammaOpposite)
         res.gamma() = -res.gamma();
-      
-      // Saturate results to the requested range
-      if (PositiveRangeAlpha && (res.alpha() < 0))
-        res.alpha() += Scalar(2 * EIGEN_PI);
-      
-      if (PositiveRangeBeta && (res.beta() < 0))
-        res.beta() += Scalar(2 * EIGEN_PI);
-      
-      if (PositiveRangeGamma && (res.gamma() < 0))
-        res.gamma() += Scalar(2 * EIGEN_PI);
     }
     
     template <typename _Scalar, class _System>
     friend class Eigen::EulerAngles;
+    
+    template<typename System,
+            typename Other,
+            int OtherRows,
+            int OtherCols>
+    friend struct internal::eulerangles_assign_impl;
   };
 
 #define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
diff --git a/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
index d49aa17f5..1c2cd24a0 100644
--- a/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_fftw_impl.h
@@ -231,6 +231,8 @@ namespace internal {
   protected:
       typedef fftw_plan<Scalar> PlanData;
 
+      typedef Eigen::numext::int64_t int64_t;
+
       typedef std::map<int64_t,PlanData> PlanMap;
 
       PlanMap m_plans;
@@ -257,5 +259,3 @@ namespace internal {
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index be51b4e6f..430953aee 100644
--- a/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -25,16 +25,47 @@ struct kiss_cpx_fft
   std::vector<Complex> m_scratchBuf;
   bool m_inverse;
 
-  inline
-    void make_twiddles(int nfft,bool inverse)
+  inline void make_twiddles(int nfft, bool inverse)
+  {
+    using numext::sin;
+    using numext::cos;
+    m_inverse = inverse;
+    m_twiddles.resize(nfft);
+    double phinc =  0.25 * double(EIGEN_PI) / nfft;
+    Scalar flip = inverse ? Scalar(1) : Scalar(-1);
+    m_twiddles[0] = Complex(Scalar(1), Scalar(0));
+    if ((nfft&1)==0)
+      m_twiddles[nfft/2] = Complex(Scalar(-1), Scalar(0));
+    int i=1;
+    for (;i*8<nfft;++i)
     {
-      using std::acos;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
+      Scalar c = Scalar(cos(i*8*phinc));
+      Scalar s = Scalar(sin(i*8*phinc));
+      m_twiddles[i] = Complex(c, s*flip);
+      m_twiddles[nfft-i] = Complex(c, -s*flip);
     }
+    for (;i*4<nfft;++i)
+    {
+      Scalar c = Scalar(cos((2*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((2*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(s, c*flip);
+      m_twiddles[nfft-i] = Complex(s, -c*flip);
+    }
+    for (;i*8<3*nfft;++i)
+    {
+      Scalar c = Scalar(cos((8*i-2*nfft)*phinc));
+      Scalar s = Scalar(sin((8*i-2*nfft)*phinc));
+      m_twiddles[i] = Complex(-s, c*flip);
+      m_twiddles[nfft-i] = Complex(-s, -c*flip);
+    }
+    for (;i*2<nfft;++i)
+    {
+      Scalar c = Scalar(cos((4*nfft-8*i)*phinc));
+      Scalar s = Scalar(sin((4*nfft-8*i)*phinc));
+      m_twiddles[i] = Complex(-c, s*flip);
+      m_twiddles[nfft-i] = Complex(-c, -s*flip);
+    }
+  }
 
   void factorize(int nfft)
   {
@@ -316,8 +347,8 @@ struct kissfft_impl
 
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc = dst[0].real() +  dst[0].imag();
-        Complex nyquist = dst[0].real() -  dst[0].imag();
+        Complex dc(dst[0].real() +  dst[0].imag());
+        Complex nyquist(dst[0].real() -  dst[0].imag());
         int k;
         for ( k=1;k <= ncfft2 ; ++k ) {
           Complex fpk = dst[k];
@@ -416,5 +447,3 @@ struct kissfft_impl
 } // end namespace internal
 
 } // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
index dc0093eb9..e7d70f39d 100644
--- a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -31,13 +31,13 @@
 #ifndef EIGEN_CONSTRAINEDCG_H
 #define EIGEN_CONSTRAINEDCG_H
 
-#include <Eigen/Core>
+#include "../../../../Eigen/Core"
 
 namespace Eigen { 
 
 namespace internal {
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * Compute the pseudo inverse of the non-square matrix C such that
   * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
   *
@@ -96,10 +96,10 @@ void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
 
 
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * Constrained conjugate gradient
   *
-  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
   */
 template<typename TMatrix, typename CMatrix,
          typename VectorX, typename VectorB, typename VectorF>
@@ -158,8 +158,6 @@ void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
     rho = r.dot(z);
 
     if (iter.finished(rho)) break;
-
-    if (iter.noiseLevel() > 0 && transition) std::cerr << "CCG: transition\n";
     if (transition || iter.first()) gamma = 0.0;
     else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
     p = z + gamma*p;
diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index bae04fc30..5ae011b75 100644
--- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_DGMRES_H
 #define EIGEN_DGMRES_H
 
-#include <Eigen/Eigenvalues>
+#include "../../../../Eigen/Eigenvalues"
 
 namespace Eigen { 
   
@@ -39,7 +39,6 @@ template <typename VectorType, typename IndexType>
 void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
 {
   eigen_assert(vec.size() == perm.size());
-  typedef typename IndexType::Scalar Index; 
   bool flag; 
   for (Index k  = 0; k < ncut; k++)
   {
@@ -58,7 +57,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
 
 }
 /**
- * \ingroup IterativeLInearSolvers_Module
+ * \ingroup IterativeLinearSolvers_Module
  * \brief A Restarted GMRES with deflation.
  * This class implements a modification of the GMRES solver for
  * sparse linear systems. The basis is built with modified 
@@ -89,7 +88,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
  * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
  *  Algebraic Solvers for Linear Systems Arising from Compressible
  *  Flows, Computers and Fluids, In Press,
- *  http://dx.doi.org/10.1016/j.compfluid.2012.03.023   
+ *  https://doi.org/10.1016/j.compfluid.2012.03.023   
  * [2] K. Burrage and J. Erhel, On the performance of various 
  * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
  * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
@@ -110,9 +109,9 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     using Base::m_tolerance; 
   public:
     using Base::_solve_impl;
+    using Base::_solve_with_guess_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef _Preconditioner Preconditioner;
@@ -143,44 +142,30 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
   
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      dgmres(matrix(), b.col(j), xj, Base::m_preconditioner);
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
-
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const Rhs& b, MatrixBase<Dest>& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
-    x = b;
-    _solve_with_guess_impl(b,x.derived());
+    EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+    
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    
+    dgmres(matrix(), b, x, Base::m_preconditioner);
   }
+
   /** 
    * Get the restart value
     */
-  int restart() { return m_restart; }
+  Index restart() { return m_restart; }
   
   /** 
    * Set the restart value (default is 30)  
    */
-  void set_restart(const int restart) { m_restart=restart; }
+  void set_restart(const Index restart) { m_restart=restart; }
   
   /** 
    * Set the number of eigenvalues to deflate at each restart 
    */
-  void setEigenv(const int neig) 
+  void setEigenv(const Index neig) 
   {
     m_neig = neig;
     if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
@@ -189,12 +174,12 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
   /** 
    * Get the size of the deflation subspace size
    */ 
-  int deflSize() {return m_r; }
+  Index deflSize() {return m_r; }
   
   /**
    * Set the maximum size of the deflation subspace
    */
-  void setMaxEigenv(const int maxNeig) { m_maxNeig = maxNeig; }
+  void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
   
   protected:
     // DGMRES algorithm 
@@ -202,27 +187,27 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
     // Perform one cycle of GMRES
     template<typename Dest>
-    int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; 
+    Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const; 
     // Compute data to use for deflation 
-    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
+    Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
     // Apply deflation to a vector
     template<typename RhsType, typename DestType>
-    int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
+    Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
     ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
     ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
     // Init data for deflation
     void dgmresInitDeflation(Index& rows) const; 
     mutable DenseMatrix m_V; // Krylov basis vectors
     mutable DenseMatrix m_H; // Hessenberg matrix 
-    mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
+    mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
     mutable Index m_restart; // Maximum size of the Krylov subspace
     mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
     mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
     mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
     mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
     mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
-    mutable int m_r; // Current number of deflated eigenvalues, size of m_U
-    mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
+    mutable Index m_r; // Current number of deflated eigenvalues, size of m_U
+    mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate
     mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
     mutable bool m_isDeflAllocated;
     mutable bool m_isDeflInitialized;
@@ -243,18 +228,30 @@ template<typename Rhs, typename Dest>
 void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
               const Preconditioner& precond) const
 {
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  RealScalar normRhs = rhs.norm();
+  if(normRhs <= considerAsZero) 
+  {
+    x.setZero();
+    m_error = 0;
+    return;
+  }
+
   //Initialization
-  int n = mat.rows(); 
+  m_isDeflInitialized = false;
+  Index n = mat.rows(); 
   DenseVector r0(n); 
-  int nbIts = 0; 
+  Index nbIts = 0; 
   m_H.resize(m_restart+1, m_restart);
   m_Hes.resize(m_restart, m_restart);
   m_V.resize(n,m_restart+1);
-  //Initial residual vector and intial norm
-  x = precond.solve(x);
+  //Initial residual vector and initial norm
+  if(x.squaredNorm()==0) 
+    x = precond.solve(rhs);
   r0 = rhs - mat * x; 
   RealScalar beta = r0.norm(); 
-  RealScalar normRhs = rhs.norm();
+  
   m_error = beta/normRhs; 
   if(m_error < m_tolerance)
     m_info = Success; 
@@ -267,8 +264,10 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
     dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
     
     // Compute the new residual vector for the restart 
-    if (nbIts < m_iterations && m_info == NoConvergence)
-      r0 = rhs - mat * x; 
+    if (nbIts < m_iterations && m_info == NoConvergence) {
+      r0 = rhs - mat * x;
+      beta = r0.norm();
+    }
   }
 } 
 
@@ -284,7 +283,7 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
  */
 template< typename _MatrixType, typename _Preconditioner>
 template<typename Dest>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const
 {
   //Initialization 
   DenseVector g(m_restart+1); // Right hand side of the least square problem
@@ -293,8 +292,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
   m_V.col(0) = r0/beta; 
   m_info = NoConvergence; 
   std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
-  int it = 0; // Number of inner iterations 
-  int n = mat.rows();
+  Index it = 0; // Number of inner iterations 
+  Index n = mat.rows();
   DenseVector tv1(n), tv2(n);  //Temporary vectors
   while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
   {    
@@ -312,7 +311,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
    
     // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
     Scalar coef; 
-    for (int i = 0; i <= it; ++i)
+    for (Index i = 0; i <= it; ++i)
     { 
       coef = tv1.dot(m_V.col(i));
       tv1 = tv1 - coef * m_V.col(i); 
@@ -328,7 +327,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, con
     // FIXME Check for happy breakdown 
     
     // Update Hessenberg matrix with Givens rotations
-    for (int i = 1; i <= it; ++i) 
+    for (Index i = 1; i <= it; ++i) 
     {
       m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
     }
@@ -394,7 +393,6 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr
 template< typename _MatrixType, typename _Preconditioner>
 inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
 {
-  typedef typename MatrixType::Index Index;
   const DenseMatrix& T = schurofH.matrixT();
   Index it = T.rows();
   ComplexVector eig(it);
@@ -418,7 +416,7 @@ inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_Matr
 }
 
 template< typename _MatrixType, typename _Preconditioner>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
 {
   // First, find the Schur form of the Hessenberg matrix H
   typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
@@ -433,8 +431,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
   
   // Reorder the absolute values of Schur values
   DenseRealVector modulEig(it); 
-  for (int j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
-  perm.setLinSpaced(it,0,it-1);
+  for (Index j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
+  perm.setLinSpaced(it,0,internal::convert_index<StorageIndex>(it-1));
   internal::sortWithPermutation(modulEig, perm, neig);
   
   if (!m_lambdaN)
@@ -442,7 +440,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
     m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
   }
   //Count the real number of extracted eigenvalues (with complex conjugates)
-  int nbrEig = 0; 
+  Index nbrEig = 0; 
   while (nbrEig < neig)
   {
     if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; 
@@ -451,7 +449,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
   // Extract the  Schur vectors corresponding to the smallest Ritz values
   DenseMatrix Sr(it, nbrEig); 
   Sr.setZero();
-  for (int j = 0; j < nbrEig; j++)
+  for (Index j = 0; j < nbrEig; j++)
   {
     Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
   }
@@ -462,8 +460,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
   if (m_r)
   {
    // Orthogonalize X against m_U using modified Gram-Schmidt
-   for (int j = 0; j < nbrEig; j++)
-     for (int k =0; k < m_r; k++)
+   for (Index j = 0; j < nbrEig; j++)
+     for (Index k =0; k < m_r; k++)
       X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); 
   }
   
@@ -473,7 +471,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
     dgmresInitDeflation(m); 
   DenseMatrix MX(m, nbrEig);
   DenseVector tv1(m);
-  for (int j = 0; j < nbrEig; j++)
+  for (Index j = 0; j < nbrEig; j++)
   {
     tv1 = mat * X.col(j);
     MX.col(j) = precond.solve(tv1);
@@ -488,8 +486,8 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
   }
   
   // Save X into m_U and m_MX in m_MU
-  for (int j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
-  for (int j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
   // Increase the size of the invariant subspace
   m_r += nbrEig; 
   
@@ -502,7 +500,7 @@ int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const Matri
 }
 template<typename _MatrixType, typename _Preconditioner>
 template<typename RhsType, typename DestType>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
 {
   DenseVector x1 = m_U.leftCols(m_r).transpose() * x; 
   y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
diff --git a/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index 5a82b0df6..ff912094f 100644
--- a/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -21,7 +21,7 @@ namespace internal {
 *
 * Parameters:
 *  \param mat       matrix of linear system of equations
-*  \param Rhs       right hand side vector of linear system of equations
+*  \param rhs       right hand side vector of linear system of equations
 *  \param x         on input: initial guess, on output: solution
 *  \param precond   preconditioner used
 *  \param iters     on input: maximum number of iterations to perform
@@ -64,6 +64,15 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
   typedef Matrix < Scalar, Dynamic, 1 > VectorType;
   typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
 
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  if(rhs.norm() <= considerAsZero) 
+  {
+    x.setZero();
+    tol_error = 0;
+    return true;
+  }
+
   RealScalar tol = tol_error;
   const Index maxIters = iters;
   iters = 0;
@@ -307,31 +316,14 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
   {
-    bool failed = false;
-    for(Index j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::gmres(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
+    m_info = (!ret) ? NumericalIssue
           : m_error <= Base::m_tolerance ? Success
           : NoConvergence;
-    m_isInitialized = true;
-  }
-
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
-  {
-    x = b;
-    if(x.squaredNorm() == 0) return; // Check Zero right hand side
-    _solve_with_guess_impl(b,x.derived());
   }
 
 protected:
diff --git a/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
new file mode 100755
index 000000000..90d20fad4
--- /dev/null
+++ b/unsupported/Eigen/src/IterativeSolvers/IDRS.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_IDRS_H
+#define EIGEN_IDRS_H
+
+namespace Eigen
+{
+
+	namespace internal
+	{
+		/**     \internal Low-level Induced Dimension Reduction algoritm
+		        \param A The matrix A
+		        \param b The right hand side vector b
+		        \param x On input and initial solution, on output the computed solution.
+		        \param precond A preconditioner being able to efficiently solve for an
+		                  approximation of Ax=b (regardless of b)
+		        \param iter On input the max number of iteration, on output the number of performed iterations.
+		        \param relres On input the tolerance error, on output an estimation of the relative error.
+		        \param S On input Number of the dimension of the shadow space.
+				\param smoothing switches residual smoothing on.
+				\param angle small omega lead to faster convergence at the expense of numerical stability
+				\param replacement switches on a residual replacement strategy to increase accuracy of residual at the expense of more Mat*vec products
+		        \return false in the case of numerical issue, for example a break down of IDRS.
+		*/
+		template<typename Vector, typename RealScalar>
+		typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle)
+		{
+			using numext::abs;
+			typedef typename Vector::Scalar Scalar;
+			const RealScalar ns = s.norm();
+			const RealScalar nt = t.norm();
+			const Scalar ts = t.dot(s);
+			const RealScalar rho = abs(ts / (nt * ns));
+
+			if (rho < angle) {
+				if (ts == Scalar(0)) {
+					return Scalar(0);
+				}
+				// Original relation for om is given by
+				// om = om * angle / rho;
+				// To alleviate potential (near) division by zero this can be rewritten as
+				// om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts)
+  				return angle * (ns / nt) * (ts / abs(ts));
+			}
+			return ts / (nt * nt);
+		}
+
+		template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+		bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond,
+			Index& iter,
+			typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle, bool replacement)
+		{
+			typedef typename Dest::RealScalar RealScalar;
+			typedef typename Dest::Scalar Scalar;
+			typedef Matrix<Scalar, Dynamic, 1> VectorType;
+			typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+			const Index N = b.size();
+			S = S < x.rows() ? S : x.rows();
+			const RealScalar tol = relres;
+			const Index maxit = iter;
+
+			Index replacements = 0;
+			bool trueres = false;
+
+			FullPivLU<DenseMatrixType> lu_solver;
+
+			DenseMatrixType P;
+			{
+				HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+			    P = (qr.householderQ() * DenseMatrixType::Identity(N, S));
+			}
+
+			const RealScalar normb = b.norm();
+
+			if (internal::isApprox(normb, RealScalar(0)))
+			{
+				//Solution is the zero vector
+				x.setZero();
+				iter = 0;
+				relres = 0;
+				return true;
+			}
+			 // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
+			 // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
+			 // With epsilon the
+             // relative machine precision. The factor tol/epsilon corresponds to the size of a
+             // finite precision number that is so large that the absolute round-off error in
+             // this number, when propagated through the process, makes it impossible to
+             // achieve the required accuracy.The factor C accounts for the accumulation of
+             // round-off errors. This parameter has beenset to 10−3.
+			 // mp is epsilon/C
+			 // 10^3 * eps is very conservative, so normally no residual replacements will take place. 
+			 // It only happens if things go very wrong. Too many restarts may ruin the convergence.
+			const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
+
+
+
+			//Compute initial residual
+			const RealScalar tolb = tol * normb; //Relative tolerance
+			VectorType r = b - A * x;
+
+			VectorType x_s, r_s;
+
+			if (smoothing)
+			{
+				x_s = x;
+				r_s = r;
+			}
+
+			RealScalar normr = r.norm();
+
+			if (normr <= tolb)
+			{
+				//Initial guess is a good enough solution
+				iter = 0;
+				relres = normr / normb;
+				return true;
+			}
+
+			DenseMatrixType G = DenseMatrixType::Zero(N, S);
+			DenseMatrixType U = DenseMatrixType::Zero(N, S);
+			DenseMatrixType M = DenseMatrixType::Identity(S, S);
+			VectorType t(N), v(N);
+			Scalar om = 1.;
+
+			//Main iteration loop, guild G-spaces:
+			iter = 0;
+
+			while (normr > tolb && iter < maxit)
+			{
+				//New right hand size for small system:
+				VectorType f = (r.adjoint() * P).adjoint();
+
+				for (Index k = 0; k < S; ++k)
+				{
+					//Solve small system and make v orthogonal to P:
+					//c = M(k:s,k:s)\f(k:s);
+					lu_solver.compute(M.block(k , k , S -k, S - k ));
+					VectorType c = lu_solver.solve(f.segment(k , S - k ));
+					//v = r - G(:,k:s)*c;
+					v = r - G.rightCols(S - k ) * c;
+					//Preconditioning
+					v = precond.solve(v);
+
+					//Compute new U(:,k) and G(:,k), G(:,k) is in space G_j
+					U.col(k) = U.rightCols(S - k ) * c + om * v;
+					G.col(k) = A * U.col(k );
+
+					//Bi-Orthogonalise the new basis vectors:
+					for (Index i = 0; i < k-1 ; ++i)
+					{
+						//alpha =  ( P(:,i)'*G(:,k) )/M(i,i);
+						Scalar alpha = P.col(i ).dot(G.col(k )) / M(i, i );
+						G.col(k ) = G.col(k ) - alpha * G.col(i );
+						U.col(k ) = U.col(k ) - alpha * U.col(i );
+					}
+
+					//New column of M = P'*G  (first k-1 entries are zero)
+					//M(k:s,k) = (G(:,k)'*P(:,k:s))';
+					M.block(k , k , S - k , 1) = (G.col(k ).adjoint() * P.rightCols(S - k )).adjoint();
+
+					if (internal::isApprox(M(k,k), Scalar(0)))
+					{
+						return false;
+					}
+
+					//Make r orthogonal to q_i, i = 0..k-1
+					Scalar beta = f(k ) / M(k , k );
+					r = r - beta * G.col(k );
+					x = x + beta * U.col(k );
+					normr = r.norm();
+
+					if (replacement && normr > tolb / mp)
+					{
+						trueres = true;
+					}
+
+					//Smoothing:
+					if (smoothing)
+					{
+						t = r_s - r;
+						//gamma is a Scalar, but the conversion is not allowed
+						Scalar gamma = t.dot(r_s) / t.norm();
+						r_s = r_s - gamma * t;
+						x_s = x_s - gamma * (x_s - x);
+						normr = r_s.norm();
+					}
+
+					if (normr < tolb || iter == maxit)
+					{
+						break;
+					}
+
+					//New f = P'*r (first k  components are zero)
+					if (k < S-1)
+					{
+						f.segment(k + 1, S - (k + 1) ) = f.segment(k + 1 , S - (k + 1)) - beta * M.block(k + 1 , k , S - (k + 1), 1);
+					}
+				}//end for
+
+				if (normr < tolb || iter == maxit)
+				{
+					break;
+				}
+
+				//Now we have sufficient vectors in G_j to compute residual in G_j+1
+				//Note: r is already perpendicular to P so v = r
+				//Preconditioning
+				v = r;
+				v = precond.solve(v);
+
+				//Matrix-vector multiplication:
+				t = A * v;
+
+				//Computation of a new omega
+				om = internal::omega(t, r, angle);
+
+				if (om == RealScalar(0.0))
+				{
+					return false;
+				}
+
+				r = r - om * t;
+				x = x + om * v;
+				normr = r.norm();
+
+				if (replacement && normr > tolb / mp)
+				{
+					trueres = true;
+				}
+
+				//Residual replacement?
+				if (trueres && normr < normb)
+				{
+					r = b - A * x;
+					trueres = false;
+					replacements++;
+				}
+
+				//Smoothing:
+				if (smoothing)
+				{
+					t = r_s - r;
+					Scalar gamma = t.dot(r_s) /t.norm();
+					r_s = r_s - gamma * t;
+					x_s = x_s - gamma * (x_s - x);
+					normr = r_s.norm();
+				}
+
+				iter++;
+
+			}//end while
+
+			if (smoothing)
+			{
+				x = x_s;
+			}
+			relres=normr/normb;
+			return true;
+		}
+
+	}  // namespace internal
+
+	template <typename _MatrixType, typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+	class IDRS;
+
+	namespace internal
+	{
+
+		template <typename _MatrixType, typename _Preconditioner>
+		struct traits<Eigen::IDRS<_MatrixType, _Preconditioner> >
+		{
+			typedef _MatrixType MatrixType;
+			typedef _Preconditioner Preconditioner;
+		};
+
+	}  // namespace internal
+
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square problems.
+  *
+  * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse.
+  * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for
+  * solving large nonsymmetric systems of linear equations.
+  *
+  * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices
+  * with complex eigenvalues more efficiently than BiCGStab.
+  *
+  * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods 
+  * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems). 
+  *
+  * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations,
+  * with N the system size.  It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations 
+  * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations. 
+  * Restarting GMRES limits the memory consumption, but destroys the finite termination property.
+  *
+  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  *
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  *
+  * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  *
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  *
+  * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  */
+	template <typename _MatrixType, typename _Preconditioner>
+	class IDRS : public IterativeSolverBase<IDRS<_MatrixType, _Preconditioner> >
+	{
+
+		public:
+			typedef _MatrixType MatrixType;
+			typedef typename MatrixType::Scalar Scalar;
+			typedef typename MatrixType::RealScalar RealScalar;
+			typedef _Preconditioner Preconditioner;
+
+		private:
+			typedef IterativeSolverBase<IDRS> Base;
+			using Base::m_error;
+			using Base::m_info;
+			using Base::m_isInitialized;
+			using Base::m_iterations;
+			using Base::matrix;
+			Index m_S;
+			bool m_smoothing;
+			RealScalar m_angle;
+			bool m_residual;
+
+		public:
+			/** Default constructor. */
+			IDRS(): m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+			/**     Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+			        This constructor is a shortcut for the default constructor followed
+			        by a call to compute().
+
+			        \warning this class stores a reference to the matrix A as well as some
+			        precomputed values that depend on it. Therefore, if \a A is changed
+			        this class becomes invalid. Call compute() to update it with the new
+			        matrix A, or modify a copy of A.
+			*/
+			template <typename MatrixDerived>
+			explicit IDRS(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_S(4), m_smoothing(false),
+															   m_angle(RealScalar(0.7)), m_residual(false) {}
+
+
+			/** \internal */
+			/**     Loops over the number of columns of b and does the following:
+			                1. sets the tolerence and maxIterations
+			                2. Calls the function that has the core solver routine
+			*/
+			template <typename Rhs, typename Dest>
+			void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+			{
+				m_iterations = Base::maxIterations();
+				m_error = Base::m_tolerance;
+
+				bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S,m_smoothing,m_angle,m_residual);
+
+				m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+			}
+
+			/** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/
+			void setS(Index S)
+			{
+				if (S < 1)
+				{
+					S = 4;
+				}
+
+				m_S = S;
+			}
+
+			/** Switches off and on smoothing.
+			Residual smoothing results in monotonically decreasing residual norms at
+			the expense of two extra vectors of storage and a few extra vector
+			operations. Although monotonic decrease of the residual norms is a
+			desirable property, the rate of convergence of the unsmoothed process and
+			the smoothed process is basically the same. Default is off */
+			void setSmoothing(bool smoothing)
+			{
+				m_smoothing=smoothing;
+			}
+
+			/** The angle must be a real scalar. In IDR(s), a value for the
+			iteration parameter omega must be chosen in every s+1th step. The most
+			natural choice is to select a value to minimize the norm of the next residual.
+			This corresponds to the parameter omega = 0. In practice, this may lead to
+			values of omega that are so small that the other iteration parameters
+			cannot be computed with sufficient accuracy. In such cases it is better to
+			increase the value of omega sufficiently such that a compromise is reached
+			between accurate computations and reduction of the residual norm. The
+			parameter angle =0.7 (”maintaining the convergence strategy”)
+			results in such a compromise. */
+			void setAngle(RealScalar angle)
+			{
+				m_angle=angle;
+			}
+
+			/** The parameter replace is a logical that determines whether a
+			residual replacement strategy is employed to increase the accuracy of the
+			solution. */
+			void setResidualUpdate(bool update)
+			{
+				m_residual=update;
+			}
+
+	};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_IDRS_H */
diff --git a/unsupported/Eigen/src/IterativeSolvers/IterationController.h b/unsupported/Eigen/src/IterativeSolvers/IterationController.h
index c9c1a4be2..a116e09e2 100644
--- a/unsupported/Eigen/src/IterativeSolvers/IterationController.h
+++ b/unsupported/Eigen/src/IterativeSolvers/IterationController.h
@@ -60,7 +60,7 @@
 
 namespace Eigen { 
 
-/** \ingroup IterativeSolvers_Module
+/** \ingroup IterativeLinearSolvers_Module
   * \class IterationController
   *
   * \brief Controls the iterations of the iterative solvers
diff --git a/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 256990c1a..5db454d24 100644
--- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -3,6 +3,7 @@
 //
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
 // Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -64,8 +65,6 @@ namespace Eigen {
             eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
             RealScalar beta_new(sqrt(beta_new2));
             const RealScalar beta_one(beta_new);
-            v_new /= beta_new;
-            w_new /= beta_new;
             // Initialize other variables
             RealScalar c(1.0); // the cosine of the Givens rotation
             RealScalar c_old(1.0);
@@ -83,18 +82,18 @@ namespace Eigen {
                 /* Note that there are 4 variants on the Lanczos algorithm. These are
                  * described in Paige, C. C. (1972). Computational variants of
                  * the Lanczos method for the eigenproblem. IMA Journal of Applied
-                 * Mathematics, 10(3), 373–381. The current implementation corresponds 
+                 * Mathematics, 10(3), 373-381. The current implementation corresponds 
                  * to the case A(2,7) in the paper. It also corresponds to 
-                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
                  * Systems, 2003 p.173. For the preconditioned version see 
                  * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
                  */
                 const RealScalar beta(beta_new);
                 v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-//                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
+                v_new /= beta_new; // overwrite v_new for next iteration
+                w_new /= beta_new; // overwrite w_new for next iteration
                 v = v_new; // update
                 w = w_new; // update
-//                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
                 v_new.noalias() = mat*w - beta*v_old; // compute v_new
                 const RealScalar alpha = v_new.dot(w);
                 v_new -= alpha*v; // overwrite v_new
@@ -102,8 +101,6 @@ namespace Eigen {
                 beta_new2 = v_new.dot(w_new); // compute beta_new
                 eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
                 beta_new = sqrt(beta_new2); // compute beta_new
-                v_new /= beta_new; // overwrite v_new for next iteration
-                w_new /= beta_new; // overwrite w_new for next iteration
                 
                 // Givens rotation
                 const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
@@ -117,7 +114,6 @@ namespace Eigen {
                 
                 // Update solution
                 p_oold = p_old;
-//                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
                 p_old = p;
                 p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
                 x += beta_one*c*eta*p;
@@ -237,7 +233,7 @@ namespace Eigen {
 
         /** \internal */
         template<typename Rhs,typename Dest>
-        void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+        void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
         {
             typedef typename Base::MatrixWrapper MatrixWrapper;
             typedef typename Base::ActualMatrixType ActualMatrixType;
@@ -257,28 +253,11 @@ namespace Eigen {
             m_iterations = Base::maxIterations();
             m_error = Base::m_tolerance;
             RowMajorWrapper row_mat(matrix());
-            for(int j=0; j<b.cols(); ++j)
-            {
-                m_iterations = Base::maxIterations();
-                m_error = Base::m_tolerance;
-                
-                typename Dest::ColXpr xj(x,j);
-                internal::minres(SelfAdjointWrapper(row_mat), b.col(j), xj,
-                                 Base::m_preconditioner, m_iterations, m_error);
-            }
-            
-            m_isInitialized = true;
+            internal::minres(SelfAdjointWrapper(row_mat), b, x,
+                             Base::m_preconditioner, m_iterations, m_error);
             m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
         }
         
-        /** \internal */
-        template<typename Rhs,typename Dest>
-        void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
-        {
-            x.setZero();
-            _solve_with_guess_impl(b,x.derived());
-        }
-        
     protected:
         
     };
@@ -286,4 +265,3 @@ namespace Eigen {
 } // end namespace Eigen
 
 #endif // EIGEN_MINRES_H
-
diff --git a/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index d113e6e90..9b3eb53e0 100644
--- a/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -104,12 +104,18 @@ class IterScaling
         for (int i = 0; i < m; ++i) 
         {
           Dr(i) = std::sqrt(Dr(i));
+        }
+        for (int i = 0; i < n; ++i) 
+        {
           Dc(i) = std::sqrt(Dc(i));
         }
         // Save the scaling factors 
         for (int i = 0; i < m; ++i) 
         {
           m_left(i) /= Dr(i);
+        }
+        for (int i = 0; i < n; ++i) 
+        {
           m_right(i) /= Dc(i);
         }
         // Scale the rows and the columns of the matrix
diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 582fa8512..6a9b0be88 100644
--- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -235,10 +235,10 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
     MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
     MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
 
-    EvalToRowMajor = (LhsFlags & RhsFlags & RowMajorBit),
+    EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit),
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
-    Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits)
           | EvalBeforeNestingBit,
     CoeffReadCost = HugeCost
   };
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index ae9d793b1..123485817 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -73,7 +73,7 @@ void lmqrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 995427978..62561da1d 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -117,7 +117,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
     typedef typename JacobianType::RealScalar RealScalar; 
     typedef typename QRSolver::StorageIndex PermIndex;
     typedef Matrix<Scalar,Dynamic,1> FVectorType;
-    typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
+    typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
   public:
     LevenbergMarquardt(FunctorType& functor) 
     : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
@@ -233,9 +233,9 @@ class LevenbergMarquardt : internal::no_assignment_operator
     
     /** 
      * \brief Reports whether the minimization was successful
-     * \returns \c Success if the minimization was succesful,
+     * \returns \c Success if the minimization was successful,
      *         \c NumericalIssue if a numerical problem arises during the 
-     *          minimization process, for exemple during the QR factorization
+     *          minimization process, for example during the QR factorization
      *         \c NoConvergence if the minimization did not converge after 
      *          the maximum number of function evaluation allowed
      *          \c InvalidInput if the input matrix is invalid
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index bb6d9e1fe..02284b0dd 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -234,12 +234,13 @@ struct matrix_exp_computeUV<MatrixType, float>
 template <typename MatrixType>
 struct matrix_exp_computeUV<MatrixType, double>
 {
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
   template <typename ArgType>
   static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
   {
     using std::frexp;
     using std::pow;
-    const double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    const RealScalar l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
     squarings = 0;
     if (l1norm < 1.495585217958292e-002) {
       matrix_exp_pade3(arg, U, V);
@@ -250,10 +251,10 @@ struct matrix_exp_computeUV<MatrixType, double>
     } else if (l1norm < 2.097847961257068e+000) {
       matrix_exp_pade9(arg, U, V);
     } else {
-      const double maxnorm = 5.371920351148152;
+      const RealScalar maxnorm = 5.371920351148152;
       frexp(l1norm / maxnorm, &squarings);
       if (squarings < 0) squarings = 0;
-      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<double>(squarings));
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<RealScalar>(squarings));
       matrix_exp_pade13(A, U, V);
     }
   }
@@ -313,7 +314,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
       matrix_exp_pade17(A, U, V);
     }
   
-#elif LDBL_MANT_DIG <= 112  // quadruple precison
+#elif LDBL_MANT_DIG <= 113  // quadruple precision
   
     if (l1norm < 1.639394610288918690547467954466970e-005L) {
       matrix_exp_pade3(arg, U, V);
@@ -326,6 +327,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
     } else if (l1norm < 1.125358383453143065081397882891878e+000L) {
       matrix_exp_pade13(arg, U, V);
     } else {
+      const long double maxnorm = 2.884233277829519311757165057717815L;
       frexp(l1norm / maxnorm, &squarings);
       if (squarings < 0) squarings = 0;
       MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
@@ -342,6 +344,27 @@ struct matrix_exp_computeUV<MatrixType, long double>
   }
 };
 
+template<typename T> struct is_exp_known_type : false_type {};
+template<> struct is_exp_known_type<float> : true_type {};
+template<> struct is_exp_known_type<double> : true_type {};
+#if LDBL_MANT_DIG <= 113
+template<> struct is_exp_known_type<long double> : true_type {};
+#endif
+
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type
+{
+  typedef typename ArgType::PlainObject MatrixType;
+  MatrixType U, V;
+  int squarings;
+  matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
+  MatrixType numer = U + V;
+  MatrixType denom = -U + V;
+  result = denom.partialPivLu().solve(numer);
+  for (int i=0; i<squarings; i++)
+    result *= result;   // undo scaling by repeated squaring
+}
+
 
 /* Computes the matrix exponential
  *
@@ -349,26 +372,13 @@ struct matrix_exp_computeUV<MatrixType, long double>
  * \param result variable in which result will be stored
  */
 template <typename ArgType, typename ResultType>
-void matrix_exp_compute(const ArgType& arg, ResultType &result)
+void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default
 {
   typedef typename ArgType::PlainObject MatrixType;
-#if LDBL_MANT_DIG > 112 // rarely happens
   typedef typename traits<MatrixType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename std::complex<RealScalar> ComplexScalar;
-  if (sizeof(RealScalar) > 14) {
-    result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
-    return;
-  }
-#endif
-  MatrixType U, V;
-  int squarings; 
-  matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
-  MatrixType numer = U + V;
-  MatrixType denom = -U + V;
-  result = denom.partialPivLu().solve(numer);
-  for (int i=0; i<squarings; i++)
-    result *= result;   // undo scaling by repeated squaring
+  result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
 }
 
 } // end namespace Eigen::internal
@@ -386,7 +396,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result)
 template<typename Derived> struct MatrixExponentialReturnValue
 : public ReturnByValue<MatrixExponentialReturnValue<Derived> >
 {
-    typedef typename Derived::Index Index;
   public:
     /** \brief Constructor.
       *
@@ -402,7 +411,7 @@ template<typename Derived> struct MatrixExponentialReturnValue
     inline void evalTo(ResultType& result) const
     {
       const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
-      internal::matrix_exp_compute(tmp, result);
+      internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::RealScalar>());
     }
 
     Index rows() const { return m_src.rows(); }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 3f7d77710..cc12ab62b 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_MATRIX_FUNCTION
-#define EIGEN_MATRIX_FUNCTION
+#ifndef EIGEN_MATRIX_FUNCTION_H
+#define EIGEN_MATRIX_FUNCTION_H
 
 #include "StemFunction.h"
 
@@ -53,7 +53,7 @@ template <typename MatrixType>
 typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
 {
   typedef typename plain_col_type<MatrixType>::type VectorType;
-  typename MatrixType::Index rows = A.rows();
+  Index rows = A.rows();
   const MatrixType N = MatrixType::Identity(rows, rows) - A;
   VectorType e = VectorType::Ones(rows);
   N.template triangularView<Upper>().solveInPlace(e);
@@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
 {
   // TODO: Use that A is upper triangular
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
   Index rows = A.rows();
   Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
   MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
@@ -73,10 +72,10 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
   MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
   MatrixType P = Ashifted;
   MatrixType Fincr;
-  for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary
+  for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary
     Fincr = m_f(avgEival, static_cast<int>(s)) * P;
     F += Fincr;
-    P = Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted;
+    P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted;
 
     // test whether Taylor series converged
     const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
@@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf
 template <typename EivalsType, typename Cluster>
 void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
-  typedef typename EivalsType::Index Index;
   typedef typename EivalsType::RealScalar RealScalar;
   for (Index i=0; i<eivals.rows(); ++i) {
     // Find cluster containing i-th ei'val, adding a new cluster if necessary
@@ -179,7 +177,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 {
   blockStart.resize(clusterSize.rows());
   blockStart(0) = 0;
-  for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
+  for (Index i = 1; i < clusterSize.rows(); i++) {
     blockStart(i) = blockStart(i-1) + clusterSize(i-1);
   }
 }
@@ -188,7 +186,6 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 template <typename EivalsType, typename ListOfClusters, typename VectorType>
 void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
 {
-  typedef typename EivalsType::Index Index;
   eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
   for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
@@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters&
 template <typename DynVectorType, typename VectorType>
 void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
 {
-  typedef typename VectorType::Index Index;
   DynVectorType indexNextEntry = blockStart;
   permutation.resize(eivalToCluster.rows());
   for (Index i = 0; i < eivalToCluster.rows(); i++) {
@@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const
 template <typename VectorType, typename MatrixType>
 void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
 {
-  typedef typename VectorType::Index Index;
   for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
     for (j = i; j < permutation.rows(); j++) {
@@ -247,7 +242,7 @@ template <typename MatrixType, typename AtomicType, typename VectorType>
 void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
 { 
   fT.setZero(T.rows(), T.cols());
-  for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
+  for (Index i = 0; i < clusterSize.rows(); ++i) {
     fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
       = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
@@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
 
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index m = A.rows();
@@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp
 { 
   typedef internal::traits<MatrixType> Traits;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
-  static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-  static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
   static const int Options = MatrixType::Options;
-  typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
 
   for (Index k = 1; k < clusterSize.rows(); k++) {
     for (Index i = 0; i < clusterSize.rows() - k; i++) {
@@ -428,7 +419,8 @@ struct matrix_function_compute<MatrixType, 1>
     typedef internal::traits<MatrixType> Traits;
     
     // compute Schur decomposition of A
-    const ComplexSchur<MatrixType> schurOfA(A);  
+    const ComplexSchur<MatrixType> schurOfA(A);
+    eigen_assert(schurOfA.info()==Success);
     MatrixType T = schurOfA.matrixT();
     MatrixType U = schurOfA.matrixU();
 
@@ -480,7 +472,6 @@ template<typename Derived> class MatrixFunctionReturnValue
 {
   public:
     typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
     typedef typename internal::stem_function<Scalar>::type StemFunction;
 
   protected:
@@ -505,10 +496,8 @@ template<typename Derived> class MatrixFunctionReturnValue
       typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
       typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
       typedef internal::traits<NestedEvalTypeClean> Traits;
-      static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-      static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
 
       typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
       AtomicType atomic(m_f);
@@ -577,4 +566,4 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
 
 } // end namespace Eigen
 
-#endif // EIGEN_MATRIX_FUNCTION
+#endif // EIGEN_MATRIX_FUNCTION_H
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index ff8f6e732..e917013e0 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -62,8 +62,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
+    RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y;
   }
 }
 
@@ -135,7 +135,8 @@ void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree
   const int minPadeDegree = 3;
   const int maxPadeDegree = 11;
   assert(degree >= minPadeDegree && degree <= maxPadeDegree);
-
+  // FIXME this creates float-conversion-warnings if these are enabled.
+  // Either manually convert each value, or disable the warning locally
   const RealScalar nodes[][maxPadeDegree] = { 
     { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
       0.8872983346207416885179265399782400L }, 
@@ -232,12 +233,13 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   int degree;
   MatrixType T = A, sqrtT;
 
-  int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
-  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+  const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = RealScalar(
+                                    maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
                                     maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
                                     maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
                                     maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
-                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
+                                                       1.1880960220216759245467951592883642e-1L); // quadruple precision
 
   while (true) {
     RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
@@ -254,7 +256,7 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   }
 
   matrix_log_compute_pade(result, T, degree);
-  result *= pow(RealScalar(2), numberOfSquareRoots);
+  result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible
 }
 
 /** \ingroup MatrixFunctions_Module
@@ -324,7 +326,7 @@ public:
   
   /** \brief Compute the matrix logarithm.
     *
-    * \param[out]  result  Logarithm of \p A, where \A is as specified in the constructor.
+    * \param[out]  result  Logarithm of \c A, where \c A is as specified in the constructor.
     */
   template <typename ResultType>
   inline void evalTo(ResultType& result) const
@@ -332,10 +334,8 @@ public:
     typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
     typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
     typedef internal::traits<DerivedEvalTypeClean> Traits;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
     
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index ebc433d89..d7672d7c9 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
 {
   public:
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     /**
      * \brief Constructor.
@@ -57,8 +56,8 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
      * \param[out] result
      */
     template<typename ResultType>
-    inline void evalTo(ResultType& res) const
-    { m_pow.compute(res, m_p); }
+    inline void evalTo(ResultType& result) const
+    { m_pow.compute(result, m_p); }
 
     Index rows() const { return m_pow.rows(); }
     Index cols() const { return m_pow.cols(); }
@@ -81,7 +80,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
  *
  * \note Currently this class is only used by MatrixPower. One may
  * insist that this be nested into MatrixPower. This class is here to
- * faciliate future development of triangular matrix functions.
+ * facilitate future development of triangular matrix functions.
  */
 template<typename MatrixType>
 class MatrixPowerAtomic : internal::noncopyable
@@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef std::complex<RealScalar> ComplexScalar;
-    typedef typename MatrixType::Index Index;
     typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
 
     const MatrixType& m_A;
@@ -162,11 +160,11 @@ template<typename MatrixType>
 void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
 {
   int i = 2*degree;
-  res = (m_p-degree) / (2*i-2) * IminusT;
+  res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT;
 
   for (--i; i; --i) {
     res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
-	.solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval();
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval();
   }
   res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
 }
@@ -196,11 +194,12 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
 {
   using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1L                            // single precision
+  const RealScalar maxNormForPade = RealScalar(
+                                    digits <=  24? 4.3386528e-1L                            // single precision
                                   : digits <=  53? 2.789358995219730e-1L                    // double precision
                                   : digits <=  64? 2.4471944416607995472e-1L                // extended precision
                                   : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
-                                  :                9.134603732914548552537150753385375e-2L; // quadruple precision
+                                  :                9.134603732914548552537150753385375e-2L); // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
@@ -298,8 +297,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
+  RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
@@ -340,7 +339,6 @@ class MatrixPower : internal::noncopyable
   private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
   public:
     /**
@@ -600,7 +598,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename Derived::RealScalar RealScalar;
-    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
@@ -618,8 +615,8 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
      * constructor.
      */
     template<typename ResultType>
-    inline void evalTo(ResultType& res) const
-    { MatrixPower<PlainObject>(m_A.eval()).compute(res, m_p); }
+    inline void evalTo(ResultType& result) const
+    { MatrixPower<PlainObject>(m_A.eval()).compute(result, m_p); }
 
     Index rows() const { return m_A.rows(); }
     Index cols() const { return m_A.cols(); }
@@ -648,7 +645,6 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
-    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
@@ -669,8 +665,8 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
      * constructor.
      */
     template<typename ResultType>
-    inline void evalTo(ResultType& res) const
-    { res = (m_p * m_A.log()).exp(); }
+    inline void evalTo(ResultType& result) const
+    { result = (m_p * m_A.log()).exp(); }
 
     Index rows() const { return m_A.rows(); }
     Index cols() const { return m_A.cols(); }
diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index afd88ec4d..e363e779d 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -17,7 +17,7 @@ namespace internal {
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
 {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   //       in EigenSolver. If we expose it, we could call it directly from here.
@@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
@@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
@@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
@@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
@@ -120,7 +120,6 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
 {
   using std::sqrt;
-  typedef typename MatrixType::Index Index;
   const Index size = T.rows();
   for (Index i = 0; i < size; i++) {
     if (i == size - 1 || T.coeff(i+1, i) == 0) {
@@ -139,7 +138,6 @@ void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrt
 template <typename MatrixType, typename ResultType>
 void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
 {
-  typedef typename MatrixType::Index Index;
   const Index size = T.rows();
   for (Index j = 1; j < size; j++) {
       if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
@@ -206,8 +204,7 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
 {
   using std::sqrt;
-  typedef typename MatrixType::Index Index;
-      typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Scalar Scalar;
 
   eigen_assert(arg.rows() == arg.cols());
 
@@ -256,18 +253,19 @@ struct matrix_sqrt_compute
 template <typename MatrixType>
 struct matrix_sqrt_compute<MatrixType, 0>
 {
+  typedef typename MatrixType::PlainObject PlainType;
   template <typename ResultType>
   static void run(const MatrixType &arg, ResultType &result)
   {
     eigen_assert(arg.rows() == arg.cols());
 
     // Compute Schur decomposition of arg
-    const RealSchur<MatrixType> schurOfA(arg);  
-    const MatrixType& T = schurOfA.matrixT();
-    const MatrixType& U = schurOfA.matrixU();
+    const RealSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
     
     // Compute square root of T
-    MatrixType sqrtT = MatrixType::Zero(arg.rows(), arg.cols());
+    PlainType sqrtT = PlainType::Zero(arg.rows(), arg.cols());
     matrix_sqrt_quasi_triangular(T, sqrtT);
     
     // Compute square root of arg
@@ -281,18 +279,19 @@ struct matrix_sqrt_compute<MatrixType, 0>
 template <typename MatrixType>
 struct matrix_sqrt_compute<MatrixType, 1>
 {
+  typedef typename MatrixType::PlainObject PlainType;
   template <typename ResultType>
   static void run(const MatrixType &arg, ResultType &result)
   {
     eigen_assert(arg.rows() == arg.cols());
 
     // Compute Schur decomposition of arg
-    const ComplexSchur<MatrixType> schurOfA(arg);  
-    const MatrixType& T = schurOfA.matrixT();
-    const MatrixType& U = schurOfA.matrixU();
+    const ComplexSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
     
     // Compute square root of T
-    MatrixType sqrtT;
+    PlainType sqrtT;
     matrix_sqrt_triangular(T, sqrtT);
     
     // Compute square root of arg
@@ -318,7 +317,6 @@ template<typename Derived> class MatrixSquareRootReturnValue
 : public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
 {
   protected:
-    typedef typename Derived::Index Index;
     typedef typename internal::ref_selector<Derived>::type DerivedNested;
 
   public:
diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index 8fe3ed86b..07c5ef014 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -52,7 +52,7 @@ public:
         Parameters()
             : factor(Scalar(100.))
             , maxfev(1000)
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
+            , xtol(numext::sqrt(NumTraits<Scalar>::epsilon()))
             , nb_of_subdiagonals(-1)
             , nb_of_superdiagonals(-1)
             , epsfcn(Scalar(0.)) {}
@@ -70,7 +70,7 @@ public:
 
     HybridNonLinearSolverSpace::Status hybrj1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
@@ -79,7 +79,7 @@ public:
 
     HybridNonLinearSolverSpace::Status hybrd1(
             FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
             );
 
     HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
diff --git a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index feafd62a8..4f2f560b3 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -61,7 +61,7 @@ void qrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index f28766061..09fc65255 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -22,7 +22,7 @@ void r1updt(
     Scalar temp;
     JacobiRotation<Scalar> givens;
 
-    // r1updt had a broader usecase, but we dont use it here. And, more
+    // r1updt had a broader usecase, but we don't use it here. And, more
     // importantly, we can not test it.
     eigen_assert(m==n);
     eigen_assert(u.size()==m);
diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h
index b515c2920..59a15b098 100644
--- a/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/unsupported/Eigen/src/Polynomials/Companion.h
@@ -20,12 +20,6 @@ namespace internal {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-template <typename T>
-T radix(){ return 2; }
-
-template <typename T>
-T radix2(){ return radix<T>()*radix<T>(); }
-
 template<int Size>
 struct decrement_if_fixed_size
 {
@@ -75,8 +69,7 @@ class companion
     void setPolynomial( const VectorType& poly )
     {
       const Index deg = poly.size()-1;
-      m_monic = -1/poly[deg] * poly.head(deg);
-      //m_bl_diag.setIdentity( deg-1 );
+      m_monic = -poly.head(deg)/poly[deg];
       m_bl_diag.setOnes(deg-1);
     }
 
@@ -89,13 +82,13 @@ class companion
     {
       const Index deg   = m_monic.size();
       const Index deg_1 = deg-1;
-      DenseCompanionMatrixType companion(deg,deg);
-      companion <<
+      DenseCompanionMatrixType companMat(deg,deg);
+      companMat <<
         ( LeftBlock(deg,deg_1)
           << LeftBlockFirstRow::Zero(1,deg_1),
           BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
         , m_monic;
-      return companion;
+      return companMat;
     }
 
 
@@ -104,20 +97,20 @@ class companion
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
-    bool balanced( Scalar colNorm, Scalar rowNorm,
-        bool& isBalanced, Scalar& colB, Scalar& rowB );
+    bool balanced( RealScalar colNorm, RealScalar rowNorm,
+        bool& isBalanced, RealScalar& colB, RealScalar& rowB );
 
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
-    bool balancedR( Scalar colNorm, Scalar rowNorm,
-        bool& isBalanced, Scalar& colB, Scalar& rowB );
+    bool balancedR( RealScalar colNorm, RealScalar rowNorm,
+        bool& isBalanced, RealScalar& colB, RealScalar& rowB );
 
   public:
     /**
@@ -139,10 +132,13 @@ class companion
 
 template< typename _Scalar, int _Deg >
 inline
-bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm,
-    bool& isBalanced, Scalar& colB, Scalar& rowB )
+bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
+    bool& isBalanced, RealScalar& colB, RealScalar& rowB )
 {
-  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
+  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm 
+      || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){
+    return true;
+  }
   else
   {
     //To find the balancing coefficients, if the radix is 2,
@@ -150,53 +146,61 @@ bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm,
     // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
     // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
     // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
-    rowB = rowNorm / radix<Scalar>();
-    colB = Scalar(1);
-    const Scalar s = colNorm + rowNorm;
-
-    while (colNorm < rowB)
+    const RealScalar radix = RealScalar(2);
+    const RealScalar radix2 = RealScalar(4);
+    
+    rowB = rowNorm / radix;
+    colB = RealScalar(1);
+    const RealScalar s = colNorm + rowNorm;
+
+    // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm
+    RealScalar scout = colNorm;
+    while (scout < rowB)
     {
-      colB *= radix<Scalar>();
-      colNorm *= radix2<Scalar>();
+      colB *= radix;
+      scout *= radix2;
     }
-
-    rowB = rowNorm * radix<Scalar>();
-
-    while (colNorm >= rowB)
+    
+    // We now have an upper-bound for sigma, try to lower it.
+    // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm
+    scout = colNorm * (colB / radix) * colB;  // Avoid overflow.
+    while (scout >= rowNorm)
     {
-      colB /= radix<Scalar>();
-      colNorm /= radix2<Scalar>();
+      colB /= radix;
+      scout /= radix2;
     }
 
-    //This line is used to avoid insubstantial balancing
-    if ((rowNorm + colNorm) < Scalar(0.95) * s * colB)
+    // This line is used to avoid insubstantial balancing.
+    if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB)
     {
       isBalanced = false;
-      rowB = Scalar(1) / colB;
+      rowB = RealScalar(1) / colB;
       return false;
     }
-    else{
-      return true; }
+    else
+    {
+      return true;
+    }
   }
 }
 
 template< typename _Scalar, int _Deg >
 inline
-bool companion<_Scalar,_Deg>::balancedR( Scalar colNorm, Scalar rowNorm,
-    bool& isBalanced, Scalar& colB, Scalar& rowB )
+bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm,
+    bool& isBalanced, RealScalar& colB, RealScalar& rowB )
 {
-  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
+  if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
   else
   {
     /**
      * Set the norm of the column and the row to the geometric mean
      * of the row and column norm
      */
-    const _Scalar q = colNorm/rowNorm;
+    const RealScalar q = colNorm/rowNorm;
     if( !isApprox( q, _Scalar(1) ) )
     {
       rowB = sqrt( colNorm/rowNorm );
-      colB = Scalar(1)/rowB;
+      colB = RealScalar(1)/rowB;
 
       isBalanced = false;
       return false;
@@ -219,8 +223,8 @@ void companion<_Scalar,_Deg>::balance()
   while( !hasConverged )
   {
     hasConverged = true;
-    Scalar colNorm,rowNorm;
-    Scalar colB,rowB;
+    RealScalar colNorm,rowNorm;
+    RealScalar colB,rowB;
 
     //First row, first column excluding the diagonal
     //==============================================
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
index 03198ec8e..5e0ecbb43 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
@@ -99,7 +99,7 @@ class PolynomialSolverBase
      */
     inline const RootType& greatestRoot() const
     {
-      std::greater<Scalar> greater;
+      std::greater<RealScalar> greater;
       return selectComplexRoot_withRespectToNorm( greater );
     }
 
@@ -108,7 +108,7 @@ class PolynomialSolverBase
      */
     inline const RootType& smallestRoot() const
     {
-      std::less<Scalar> less;
+      std::less<RealScalar> less;
       return selectComplexRoot_withRespectToNorm( less );
     }
 
@@ -126,7 +126,7 @@ class PolynomialSolverBase
 
       for( Index i=0; i<m_roots.size(); ++i )
       {
-        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
+        if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
         {
           if( !hasArealRoot )
           {
@@ -144,10 +144,10 @@ class PolynomialSolverBase
             }
           }
         }
-        else
+        else if(!hasArealRoot)
         {
           if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
-            res = i; }
+            res = i;}
         }
       }
       return numext::real_ref(m_roots[res]);
@@ -167,7 +167,7 @@ class PolynomialSolverBase
 
       for( Index i=0; i<m_roots.size(); ++i )
       {
-        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
+        if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
         {
           if( !hasArealRoot )
           {
@@ -213,7 +213,7 @@ class PolynomialSolverBase
         bool& hasArealRoot,
         const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
     {
-      std::greater<Scalar> greater;
+      std::greater<RealScalar> greater;
       return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
     }
 
@@ -236,7 +236,7 @@ class PolynomialSolverBase
         bool& hasArealRoot,
         const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
     {
-      std::less<Scalar> less;
+      std::less<RealScalar> less;
       return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
     }
 
@@ -259,7 +259,7 @@ class PolynomialSolverBase
         bool& hasArealRoot,
         const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
     {
-      std::greater<Scalar> greater;
+      std::greater<RealScalar> greater;
       return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
     }
 
@@ -282,7 +282,7 @@ class PolynomialSolverBase
         bool& hasArealRoot,
         const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
     {
-      std::less<Scalar> less;
+      std::less<RealScalar> less;
       return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
     }
 
@@ -327,7 +327,7 @@ class PolynomialSolverBase
   * However, almost always, correct accuracy is reached even in these cases for 64bit
   * (double) floating types and small polynomial degree (<20).
   */
-template< typename _Scalar, int _Deg >
+template<typename _Scalar, int _Deg>
 class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
 {
   public:
@@ -337,7 +337,10 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
     EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
 
     typedef Matrix<Scalar,_Deg,_Deg>                 CompanionMatrixType;
-    typedef EigenSolver<CompanionMatrixType>         EigenSolverType;
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                                          ComplexEigenSolver<CompanionMatrixType>,
+                                          EigenSolver<CompanionMatrixType> >::type EigenSolverType;
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex, Scalar, std::complex<Scalar> >::type ComplexScalar;
 
   public:
     /** Computes the complex roots of a new polynomial. */
@@ -352,6 +355,25 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
         companion.balance();
         m_eigenSolver.compute( companion.denseMatrix() );
         m_roots = m_eigenSolver.eigenvalues();
+        // cleanup noise in imaginary part of real roots:
+        // if the imaginary part is rather small compared to the real part
+        // and that cancelling the imaginary part yield a smaller evaluation,
+        // then it's safe to keep the real part only.
+        RealScalar coarse_prec = RealScalar(std::pow(4,poly.size()+1))*NumTraits<RealScalar>::epsilon();
+        for(Index i = 0; i<m_roots.size(); ++i)
+        {
+          if( internal::isMuchSmallerThan(numext::abs(numext::imag(m_roots[i])),
+                                          numext::abs(numext::real(m_roots[i])),
+                                          coarse_prec) )
+          {
+            ComplexScalar as_real_root = ComplexScalar(numext::real(m_roots[i]));
+            if(    numext::abs(poly_eval(poly, as_real_root))
+                <= numext::abs(poly_eval(poly, m_roots[i])))
+            {
+              m_roots[i] = as_real_root;
+            }
+          }
+        }
       }
       else if(poly.size () == 2)
       {
diff --git a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
index 40ba65b7e..394e857ac 100644
--- a/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+++ b/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -20,8 +20,8 @@ namespace Eigen {
  *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
  * \param[in] x : the value to evaluate the polynomial at.
  *
- * <i><b>Note for stability:</b></i>
- *  <dd> \f$ |x| \le 1 \f$ </dd>
+ * \note for stability:
+ *   \f$ |x| \le 1 \f$
  */
 template <typename Polynomials, typename T>
 inline
@@ -67,8 +67,8 @@ T poly_eval( const Polynomials& poly, const T& x )
  *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
  *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
  *
- *  <i><b>Precondition:</b></i>
- *  <dd> the leading coefficient of the input polynomial poly must be non zero </dd>
+ *  \pre
+ *   the leading coefficient of the input polynomial poly must be non zero
  */
 template <typename Polynomial>
 inline
diff --git a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
index a1f54ed35..6d0370d5b 100644
--- a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
@@ -41,7 +41,7 @@ public:
 
     /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
      *
-     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
+     * Setting a value greater than zero speeds up computation, and yields to an incomplete
      * factorization with fewer non zero coefficients. Such approximate factors are especially
      * useful to initialize an iterative solver.
      *
@@ -349,4 +349,4 @@ bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBa
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINELU_H
+#endif // EIGEN_SKYLINEINPLACELU_H
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index a2a8933ca..7c7eace7f 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -206,26 +206,26 @@ public:
             if (col > row) //upper matrix
             {
                 const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
             }
             if (col < row) //lower matrix
             {
                 const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
             }
         } else {
             if (outer > inner) //upper matrix
             {
                 const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
             }
             if (outer < inner) //lower matrix
             {
                 const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
             }
         }
@@ -300,11 +300,11 @@ public:
 
         if (IsRowMajor) {
             const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
         } else {
             const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
         }
     }
@@ -336,11 +336,11 @@ public:
 
         if (IsRowMajor) {
             const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
         } else {
             const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
         }
     }
@@ -859,4 +859,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SkylineMatrix_H
+#endif // EIGEN_SKYLINEMATRIX_H
diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
index b3a237230..b0d5e1001 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
@@ -12,7 +12,7 @@
 
 #include "SkylineUtil.h"
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \ingroup Skyline_Module
  *
@@ -102,18 +102,18 @@ public:
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-    inline Index rows() const {
+    inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
         return derived().rows();
     }
 
     /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-    inline Index cols() const {
+    inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
         return derived().cols();
     }
 
     /** \returns the number of coefficients, which is \a rows()*cols().
      * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index size() const {
+    inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT {
         return rows() * cols();
     }
 
@@ -209,4 +209,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SkylineMatrixBase_H
+#endif // EIGEN_SKYLINEMATRIXBASE_H
diff --git a/unsupported/Eigen/src/Skyline/SkylineStorage.h b/unsupported/Eigen/src/Skyline/SkylineStorage.h
index 378a8deb4..cc7514f12 100644
--- a/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ b/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -256,4 +256,4 @@ public:
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPRESSED_STORAGE_H
+#endif // EIGEN_SKYLINE_STORAGE_H
diff --git a/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
index 0e8350a7d..536a0c320 100644
--- a/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
+++ b/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
@@ -931,7 +931,7 @@ class BlockSparseMatrix : public SparseMatrixBase<BlockSparseMatrix<_Scalar,_Blo
     }
 
     /**
-      * \returns the starting position of the block <id> in the array of values
+      * \returns the starting position of the block \p id in the array of values
       */
     Index blockPtr(Index id) const
     {
diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index 037a13f86..42c99e467 100644
--- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -187,7 +187,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
     /** Does nothing: provided for compatibility with SparseMatrix */
     inline void finalize() {}
 
-    /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
     void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       for (Index j=0; j<outerSize(); ++j)
@@ -224,31 +224,43 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       }
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix()
       : m_innerSize(0), m_data(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       eigen_assert(innerSize()==0 && outerSize()==0);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
       : m_innerSize(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       resize(rows, cols);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     template<typename OtherDerived>
     EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
       : m_innerSize(0)
     {
-    Base::operator=(other.derived());
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      Base::operator=(other.derived());
     }
 
     inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
       : Base(), m_innerSize(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       *this = other.derived();
     }
 
diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index cdc14f86e..dd786d561 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -12,38 +12,38 @@
 #define EIGEN_SPARSE_MARKET_IO_H
 
 #include <iostream>
+#include <vector>
 
 namespace Eigen { 
 
 namespace internal 
 {
-  template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
   {
-    line >> i >> j >> value;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      return true; 
-    }
-    else
-      return false;
+    std::stringstream sline(line);
+    sline >> i >> j >> value;
   }
-  template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value)
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
+  { std::sscanf(line, "%d %d %g", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
+  { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
+  { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
+  { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+  template <typename Scalar, typename StorageIndex>
+  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
   {
+    std::stringstream sline(line);
     Scalar valR, valI;
-    line >> i >> j >> valR >> valI;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      value = std::complex<Scalar>(valR, valI);
-      return true; 
-    }
-    else
-      return false;
+    sline >> i >> j >> valR >> valI;
+    value = std::complex<Scalar>(valR,valI);
   }
 
   template <typename RealScalar>
@@ -81,13 +81,13 @@ namespace internal
     }
   }
 
-  template<typename Scalar>
-  inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
   {
     out << row << " "<< col << " " << value << "\n";
   }
-  template<typename Scalar>
-  inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
+  template<typename Scalar, typename StorageIndex>
+  inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
   {
     out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
   }
@@ -101,14 +101,15 @@ namespace internal
   template<typename Scalar>
   inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
   {
-    out << value.real << " " << value.imag()<< "\n"; 
+    out << value.real() << " " << value.imag()<< "\n"; 
   }
 
-} // end namepsace internal
+} // end namespace internal
 
 inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
 {
   sym = 0; 
+  iscomplex = false;
   isvector = false;
   std::ifstream in(filename.c_str(),std::ios::in);
   if(!in)
@@ -133,17 +134,20 @@ template<typename SparseMatrixType>
 bool loadMarket(SparseMatrixType& mat, const std::string& filename)
 {
   typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::Index Index;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
   std::ifstream input(filename.c_str(),std::ios::in);
   if(!input)
     return false;
+
+  char rdbuffer[4096];
+  input.rdbuf()->pubsetbuf(rdbuffer, 4096);
   
   const int maxBuffersize = 2048;
   char buffer[maxBuffersize];
   
   bool readsizes = false;
 
-  typedef Triplet<Scalar,Index> T;
+  typedef Triplet<Scalar,StorageIndex> T;
   std::vector<T> elements;
   
   Index M(-1), N(-1), NNZ(-1);
@@ -154,33 +158,36 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
     //NOTE An appropriate test should be done on the header to get the  symmetry
     if(buffer[0]=='%')
       continue;
-    
-    std::stringstream line(buffer);
-    
+
     if(!readsizes)
     {
+      std::stringstream line(buffer);
       line >> M >> N >> NNZ;
-      if(M > 0 && N > 0 && NNZ > 0) 
+      if(M > 0 && N > 0)
       {
         readsizes = true;
-        //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
         mat.resize(M,N);
         mat.reserve(NNZ);
       }
     }
     else
     { 
-      Index i(-1), j(-1);
+      StorageIndex i(-1), j(-1);
       Scalar value; 
-      if( internal::GetMarketLine(line, M, N, i, j, value) ) 
+      internal::GetMarketLine(buffer, i, j, value);
+
+      i--;
+      j--;
+      if(i>=0 && j>=0 && i<M && j<N)
       {
-        ++ count;
+        ++count;
         elements.push_back(T(i,j,value));
       }
-      else 
+      else
         std::cerr << "Invalid read: " << i << "," << j << "\n";        
     }
   }
+
   mat.setFromTriplets(elements.begin(), elements.end());
   if(count!=NNZ)
     std::cerr << count << "!=" << NNZ << "\n";
@@ -225,12 +232,13 @@ template<typename SparseMatrixType>
 bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
 {
   typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::RealScalar RealScalar;
   std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(64);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
   std::string header; 
   internal::putMarketHeader<Scalar>(header, sym); 
   out << header << std::endl; 
@@ -241,7 +249,6 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
     {
       ++ count;
       internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
-      // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
     }
   out.close();
   return true;
@@ -250,13 +257,14 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
 template<typename VectorType>
 bool saveMarketVector (const VectorType& vec, const std::string& filename)
 {
- typedef typename VectorType::Scalar Scalar; 
+ typedef typename VectorType::Scalar Scalar;
+ typedef typename VectorType::RealScalar RealScalar;
  std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(64);
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
   if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
       out << "%%MatrixMarket matrix array complex general\n"; 
   else
diff --git a/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index ee97299af..985702b5f 100644
--- a/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -10,7 +10,13 @@
 #ifndef EIGEN_RANDOMSETTER_H
 #define EIGEN_RANDOMSETTER_H
 
-namespace Eigen { 
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of 
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+namespace Eigen {
 
 /** Represents a std::map
   *
@@ -56,7 +62,26 @@ template<typename Scalar> struct StdUnorderedMapTraits
 };
 #endif // EIGEN_UNORDERED_MAP_SUPPORT
 
-#ifdef _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+  
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template<typename KeyType, typename Scalar>
+struct DenseHashMap {
+  typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template<typename KeyType, typename Scalar>
+struct SparseHashMap {
+  typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+} // namespace google
+
 /** Represents a google::dense_hash_map
   *
   * \see RandomSetter
@@ -64,7 +89,7 @@ template<typename Scalar> struct StdUnorderedMapTraits
 template<typename Scalar> struct GoogleDenseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::dense_hash_map<KeyType,Scalar> Type;
+  typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -72,9 +97,7 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
   static void setInvalidKey(Type& map, const KeyType& k)
   { map.set_empty_key(k); }
 };
-#endif
 
-#ifdef _SPARSE_HASH_MAP_H_
 /** Represents a google::sparse_hash_map
   *
   * \see RandomSetter
@@ -82,7 +105,7 @@ template<typename Scalar> struct GoogleDenseHashMapTraits
 template<typename Scalar> struct GoogleSparseHashMapTraits
 {
   typedef int KeyType;
-  typedef google::sparse_hash_map<KeyType,Scalar> Type;
+  typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
   enum {
     IsSorted = 0
   };
@@ -134,18 +157,17 @@ template<typename Scalar> struct GoogleSparseHashMapTraits
   * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
   *
   * For performance and memory consumption reasons it is highly recommended to use one of
-  * the Google's hash_map implementation. To enable the support for them, you have two options:
-  *  - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
-  *  - define EIGEN_GOOGLEHASH_SUPPORT
-  * In the later case the inclusion of <google/dense_hash_map> is made for you.
+  * Google's hash_map implementations. To enable the support for them, you must define
+  * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+  * <google/sparse_hash_map> for you.
   *
-  * \see http://code.google.com/p/google-sparsehash/
+  * \see https://github.com/sparsehash/sparsehash
   */
 template<typename SparseMatrixType,
          template <typename T> class MapTraits =
-#if defined _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
           GoogleDenseHashMapTraits
-#elif defined _HASH_MAP
+#elif defined(_HASH_MAP)
           GnuHashMapTraits
 #else
           StdMapTraits
@@ -249,10 +271,10 @@ class RandomSetter
           }
         }
         // prefix sum
-        Index count = 0;
+        StorageIndex count = 0;
         for (Index j=0; j<mp_target->outerSize(); ++j)
         {
-          Index tmp = positions[j];
+          StorageIndex tmp = positions[j];
           mp_target->outerIndexPtr()[j] = count;
           positions[j] = count;
           count += tmp;
@@ -281,7 +303,7 @@ class RandomSetter
               mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
               --i;
             }
-            mp_target->innerIndexPtr()[i+1] = inner;
+            mp_target->innerIndexPtr()[i+1] = internal::convert_index<StorageIndex>(inner);
             mp_target->valuePtr()[i+1] = it->second.value;
           }
         }
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
new file mode 100644
index 000000000..41d2bf61c
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
@@ -0,0 +1,286 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \returns an expression of the coefficient-wise i0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
+bessel_i0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
+bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
+bessel_i1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of i1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_i1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
+bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
+bessel_k0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k0e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k0e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
+bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
+bessel_k1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1e(\a x) to the given
+ * arrays.
+  *
+  * It returns the exponentially scaled modified Bessel
+  * function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of k1e(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_k1e()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
+bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the first kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
+bessel_j0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y0(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order zero.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y0(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y0()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
+bessel_y0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j1(\a x) to the given
+ * arrays.
+  *
+  * It returns the modified Bessel function of the first kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of j1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_j1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
+bessel_j1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y1(\a x) to the given
+ * arrays.
+  *
+  * It returns the Bessel function of the second kind of order one.
+  *
+  * \param x is the argument
+  *
+  * \note This function supports only float and double scalar types. To support
+  * other scalar types, the user has to provide implementations of y1(T) for
+  * any scalar type T to be supported.
+  *
+  * \sa ArrayBase::bessel_y1()
+  */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+    Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
+bessel_y1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<
+      Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>,
+      const Derived>(x.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
new file mode 100644
index 000000000..6049cc2fe
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_BFLOAT16_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
new file mode 100644
index 000000000..8606a9f8e
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order zero.
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0;
+    return bessel_i0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i0e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0e;
+    return bessel_i0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1;
+    return bessel_i1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i1e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1e;
+    return bessel_i1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_i1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_j0()
+ */
+template <typename Scalar>
+struct scalar_bessel_j0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j0;
+    return bessel_j0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_y0()
+ */
+template <typename Scalar>
+struct scalar_bessel_y0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y0;
+    return bessel_y0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j0 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the first kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1()
+ */
+template <typename Scalar>
+struct scalar_bessel_j1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j1;
+    return bessel_j1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_j1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_y1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y1;
+    return bessel_y1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_y1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j1 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the second
+ * kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0;
+    return bessel_k0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0e;
+    return bessel_k0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k0e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the
+ * second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1;
+    return bessel_k1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1e_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1e;
+    return bessel_k1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return internal::pbessel_k1e(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
new file mode 100644
index 000000000..8930d1a3c
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
@@ -0,0 +1,66 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
+#define EIGEN_BESSELFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_HALF_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
new file mode 100644
index 000000000..24812be1b
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
@@ -0,0 +1,1959 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSEL_FUNCTIONS_H
+#define EIGEN_BESSEL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes                       *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct bessel_i0e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0ef.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i0ef();
+     *
+     * y = i0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        100000      3.7e-7      7.0e-8
+     * See i0f().
+     *
+     */
+
+    const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
+                       -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
+                       -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+                       -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
+                       -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
+                       -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+                       -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
+                       -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
+                       -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+    const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
+                       2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
+                       6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+                       8.04490411014108831608E-1f};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 18>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0e.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i0e();
+     *
+     * y = i0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       5.4e-16     1.2e-16
+     * See i0().
+     *
+     */
+
+    const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
+                        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+                        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+                        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+                        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+                        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+                        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+                        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+                        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+                        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+                        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+                        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+                        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+                        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+                        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    const double B[] = {
+        -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+        4.46562142029675999901E-17,  3.46122286769746109310E-17,
+        -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+        1.77256013305652638360E-15,  3.81168066935262242075E-15,
+        -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+        1.54008621752140982691E-14,  3.85277838274214270114E-13,
+        7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+        -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+        1.18891471078464383424E-11,  4.94060238822496958910E-10,
+        3.39623202570838634515E-9,   2.26666899049817806459E-8,
+        2.04891858946906374183E-7,   2.89137052083475648297E-6,
+        6.88975834691682398426E-5,   3.36911647825569408990E-3,
+        8.04490411014108831608E-1};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 30>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct bessel_i0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0e<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i0_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i0e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i0<T>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct bessel_i1e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type >
+struct generic_i1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* i1ef.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i1ef();
+     *
+     * y = i1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.5e-6      1.5e-7
+     * See i1().
+     *
+     */
+    const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
+                       2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
+                       3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
+                       4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
+                       5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
+                       4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
+                       2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
+                       1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
+                       2.52587186443633654823E-1f};
+
+    const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
+                       -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
+                       -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+                       7.78576235018280120474E-1f};
+
+
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
+        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 7>::run(
+            psub(pdiv(pset1<T>(32.0f), y),
+                 pset1<T>(2.0f)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i1e.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i1e();
+     *
+     * y = i1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       2.0e-15     2.0e-16
+     * See i1().
+     *
+     */
+    const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
+                        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+                        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+                        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+                        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+                        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+                        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+                        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+                        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+                        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+                        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+                        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+                        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+                        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+                        2.52587186443633654823E-1};
+    const double B[] = {
+        7.51729631084210481353E-18,  4.41434832307170791151E-18,
+        -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+        2.96262899764595013876E-16,  3.30820231092092828324E-16,
+        -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+        1.04202769841288027642E-14,  4.27244001671195135429E-14,
+        -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+        -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+        1.41258074366137813316E-11,  3.25260358301548823856E-11,
+        -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+        -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+        -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+        -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+        7.78576235018280120474E-1};
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
+        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
+    T y_gt_eight = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(32.0), y),
+                 pset1<T>(2.0)), B),
+        prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct bessel_i1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_i1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(
+        pexp(pabs(x)),
+        generic_i1e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_i1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0ef.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0ef();
+     *
+     * y = k0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       8.1e-7      7.8e-8
+     * See k0().
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(
+        pcmp_le(x, pset1<T>(0.0)),
+        MAXNUM,
+        pselect(pcmp_le(x, two), x_le_two, x_gt_two));
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0e.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0e();
+     *
+     * y = k0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pmul(
+            pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0f.c
+     *	Modified Bessel function, third kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0f();
+     *
+     * y = k0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns modified Bessel function of the third kind
+     * of order zero of the argument.
+     *
+     * The range is partitioned into the two intervals [0,8] and
+     * (8, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     * Tested at 2000 random points between 0 and 8.  Peak absolute
+     * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-7      8.5e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     *  K0 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, float>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(pnegate(x)),
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*
+     *
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0();
+     *
+     * y = k0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+    const double A[] = {
+      1.37446543561352307156E-16,
+      4.25981614279661018399E-14,
+      1.03496952576338420167E-11,
+      1.90451637722020886025E-9,
+      2.53479107902614945675E-7,
+      2.28621210311945178607E-5,
+      1.26461541144692592338E-3,
+      3.59799365153615016266E-2,
+      3.44289899924628486886E-1,
+      -5.35327393233902768720E-1};
+    const double B[] = {
+       5.30043377268626276149E-18, -1.64758043015242134646E-17,
+       5.21039150503902756861E-17, -1.67823109680541210385E-16,
+       5.51205597852431940784E-16, -1.84859337734377901440E-15,
+       6.34007647740507060557E-15, -2.22751332699166985548E-14,
+       8.03289077536357521100E-14, -2.98009692317273043925E-13,
+       1.14034058820847496303E-12, -4.51459788337394416547E-12,
+       1.85594911495471785253E-11, -7.95748924447710747776E-11,
+       3.57739728140030116597E-10, -1.69753450938905987466E-9,
+       8.57403401741422608519E-9, -4.66048989768794782956E-8,
+       2.76681363944501510342E-7, -1.83175552271911948767E-6,
+       1.39498137188764993662E-5, -1.28495495816278026384E-4,
+       1.56988388573005337491E-3, -3.14481013119645005427E-2,
+       2.44030308206595545468E0
+    };
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(
+        generic_i0<T, double>::run(x), pnegate(
+            plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pmul(
+            pexp(-x),
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B)),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1e {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1ef.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1ef();
+     *
+     * y = k1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.9e-7      6.7e-8
+     * See k1().
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 10>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1e.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k1e();
+     *
+     * y = k1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-16     1.2e-16
+     * See k1().
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        internal::pchebevl<T, 25>::run(
+            psub(pdiv(pset1<T>(8.0), x), two), B),
+        prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1e<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_k1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1f.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                        1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(pnegate(x)),
+        pmul(
+            internal::pchebevl<T, 10>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+                        1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+        pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(
+        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(
+        pexp(-x),
+        pmul(
+            internal::pchebevl<T, 25>::run(
+                psub(pdiv(pset1<T>(8.0), x), two), B),
+            prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_k1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j0f();
+     *
+     * y = j0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval the following polynomial
+     * approximation is used:
+     *
+     *
+     *        2         2         2
+     * (w - r  ) (w - r  ) (w - r  ) P(w)
+     *       1         2         3
+     *
+     *            2
+     * where w = x  and the three r's are zeros of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 2        100000      1.3e-7      3.6e-8
+     *    IEEE      2, 32       100000      1.9e-7      5.4e-8
+     *
+     */
+
+    const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f,
+                        -3.969646342510940E-004f, 1.332913422519003E-002f,
+                        -1.729150680240724E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T DR1 =  pset1<T>(5.78318596294678452118f);
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-3f)),
+        pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
+        pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j0();
+     *
+     * y = j0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval the following rational
+     * approximation is used:
+     *
+     *
+     *        2         2
+     * (w - r  ) (w - r  ) P (w) / Q (w)
+     *       1         2    3       8
+     *
+     *            2
+     * where w = x  and the two r's are zeros of the function.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30       10000       4.4e-17     6.3e-18
+     *    IEEE      0, 30       60000       4.2e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12,
+                         -2.49248344360967716204E14, 9.70862251047306323952E15};
+    const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2,
+                         1.73785401676374683123E5, 4.84409658339962045305E7,
+                         1.11855537045356834862E10, 2.11277520115489217587E12,
+                         3.10518229857422583814E14, 3.18121955943204943306E16,
+                         1.71086294081043136091E18};
+    const T DR1 = pset1<T>(5.78318596294678452118E0);
+    const T DR2 = pset1<T>(3.04712623436620863991E1);
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* pi / 4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pselect(
+        pcmp_lt(y, pset1<T>(1.0e-5)),
+        pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
+        pmul(pmul(psub(z, DR1), psub(z, DR2)),
+             pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                  internal::ppolevl<T, 8>::run(z, RQ))));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_PIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y0 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     * 	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, y0f();
+     *
+     * y = y0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2         2         2
+     * y0(x)  =  (w - r  ) (w - r  ) (w - r  ) R(x)  +  2/pi ln(x) j0(x).
+     *                 1         2         3
+     *
+     * Thus a call to j0() is required.  The three zeros are removed
+     * from R(x) to improve its numerical stability.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,  2       100000      2.4e-7      3.4e-8
+     *    IEEE      2, 32       100000      1.8e-7      5.3e-8
+     *
+     */
+
+    const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f,
+                        5.344486707214273E-004f, -1.584289289821316E-002f,
+                        1.707584643733568E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+                        -2.145007480346739E-001f, 1.197549369473540E-001f,
+                        -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+                        1.756221482109099E+001f, -4.974978466280903E+000f,
+                        1.001973420681837E+000f, -1.939906941791308E-001f,
+                        6.490598792654666E-002f, -1.249992184872738E-001f};
+    const T YZ1 = pset1<T>(0.43221455686510834878f);
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535f); /* 2 / pi */
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+    T z = pmul(x, x);
+    T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
+    x_le_two = pmadd(
+        psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    T u = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y0();
+     *
+     * y = y0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *   y0(x)  = R(x)  +   2 * log(x) * j0(x) / PI.
+     * Thus a call to j0() is required.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30        9400       7.0e-17     7.9e-18
+     *    IEEE      0, 30       30000       1.3e-15     1.6e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+                        1.23953371646414299388E0, 5.44725003058768775090E0,
+                        8.74716500199817011941E0, 5.30324038235394892183E0,
+                        9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+                         1.25352743901058953537E0, 5.47097740330417105182E0,
+                         8.76190883237069594232E0, 5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+                         -1.95539544257735972385E1, -9.32060152123768231369E1,
+                         -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1, -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+                         8.56430025976980587198E2, 3.88240183605401609683E3,
+                         7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7,
+                         5.43526477051876500413E9, -9.82136065717911466409E11,
+                         8.75906394395366999549E13, -3.46628303384729719441E15,
+                         4.42733268572569800351E16, -1.84950800436986690637E16};
+    const double YQ[] = {1.00000000000000000000E0,  1.04128353664259848412E3,
+                         6.26107330137134956842E5, 2.68919633393814121987E8,
+                         8.64002487103935000337E10, 2.02979612750105546709E13,
+                         3.17157752842975028269E15, 2.50596256172653059228E17};
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T TWOOPI =  pset1<T>(0.636619772367581343075535); /* 2 / pi */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP),
+                       internal::ppolevl<T, 7>::run(z, YQ));
+    x_le_five = pmadd(
+        pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_PIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y0_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y0<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_j1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j1f();
+     *
+     * y = j1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a polynomial approximation
+     *        2
+     * (w - r  ) x P(w)
+     *       1
+     *                     2
+     * is used, where w = x  and r is the first zero of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak       rms
+     *    IEEE      0,  2       100000       1.2e-7     2.5e-8
+     *    IEEE      2, 32       100000       2.0e-7     5.3e-8
+     *
+     *
+     */
+
+    const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f,
+                        -4.541343896997497E-005f, 1.937383947804541E-003f,
+                        -3.405537384615824E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T Z1 = pset1<T>(1.46819706421238932572E1f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pmul(
+        psub(z, Z1),
+        pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
+    y_gt_two = pselect(
+        pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
+    return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j1();
+     *
+     * y = j1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 24 term Chebyshev
+     * expansion is used. In the second, the asymptotic
+     * trigonometric representation is employed using two
+     * rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       4.0e-17     1.1e-17
+     *    IEEE      0, 30       30000       2.6e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11,
+                         -7.27494245221818276015E13, 3.68295732863852883286E15};
+    const double RQ[] = {1.00000000000000000000E0, 6.20836478118054335476E2,
+                         2.56987256757748830383E5, 8.35146791431949253037E7,
+                         2.21511595479792499675E10, 4.74914122079991414898E12,
+                         7.84369607876235854894E14, 8.95222336184627338078E16,
+                         5.32278620332680085395E18};
+    const T Z1 = pset1<T>(1.46819706421238932572E1);
+    const T Z2 = pset1<T>(4.92184563216946036703E1);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP),
+                       internal::ppolevl<T, 8>::run(z, RQ));
+    y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_THPIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
+    y_gt_five = pselect(
+        pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_j1<T>::run(x);
+  }
+};
+
+template <typename T>
+struct bessel_y1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y1 {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T&) {
+    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return ScalarType(0);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, float> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2
+     * y0(x)  =  (w - r  ) x R(x^2)  +  2/pi (ln(x) j1(x) - 1/x) .
+     *                 1
+     *
+     * Thus a call to j1() is required.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    IEEE      0,  2       100000       2.2e-7     4.6e-8
+     *    IEEE      2, 32       100000       1.9e-7     5.3e-8
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+
+    const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f,
+                        6.719543806674249E-005f, -2.641785726447862E-003f,
+                        4.202369946500099E-002f};
+    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+                        3.138238455499697E-001f, -2.102302420403875E-001f,
+                        5.435364690523026E-003f, 1.493389585089498E-001f,
+                        4.976029650847191E-006f, 7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+                        -2.485774108720340E+001f, 7.222973196770240E+000f,
+                        -1.544842782180211E+000f, 3.503787691653334E-001f,
+                        -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T YO1 = pset1<T>(4.66539330185668857532f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
+    x_le_two = pmadd(
+       x_le_two, x,
+       pmul(TWOOPI, pmadd(
+           generic_j1<T, float>::run(x), plog(x),
+           pdiv(pset1<T>(-1.0f), x))));
+    x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
+
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, double> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 25 term Chebyshev
+     * expansion is used, and a call to j1() is required.
+     * In the second, the asymptotic trigonometric representation
+     * is employed using two rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       8.6e-17     1.3e-17
+     *    IEEE      0, 30       30000       1.0e-15     1.3e-16
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+                         1.12719608129684925192E0, 5.11207951146807644818E0,
+                         8.42404590141772420927E0, 5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+                         1.10514232634061696926E0, 5.07386386128601488557E0,
+                         8.39985554327604159757E0, 5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+                         7.58238284132545283818E1, 3.66779609360150777800E2,
+                         7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2, 2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+                         1.05644886038262816351E3, 4.98641058337653607651E3,
+                         9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11,
+                         1.14509511541823727583E14, -8.12770255501325109621E15,
+                         2.02439475713594898196E17, -7.78877196265950026825E17};
+    const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2,
+                         2.35564092943068577943E5, 7.34811944459721705660E7,
+                         1.87601316108706159478E10, 3.88231277496238566008E12,
+                         6.20557727146953693363E14, 6.87141087355300489866E16,
+                         3.97270608116560655612E18};
+    const T SQ2OPI = pset1<T>(.79788456080286535588);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP),
+                   internal::ppolevl<T, 8>::run(z, YQ));
+    x_le_five = pmadd(
+        x_le_five, x, pmul(
+            TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x),
+                          pdiv(pset1<T>(-1.0), x))));
+
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(
+        internal::ppolevl<T, 6>::run(s, PP),
+        internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(
+        internal::ppolevl<T, 7>::run(s, QP),
+        internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_THPIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y1_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE T run(const T x) {
+    return generic_y1<T>::run(x);
+  }
+};
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar)
+    bessel_i0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar)
+    bessel_i0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar)
+    bessel_i1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar)
+    bessel_i1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar)
+    bessel_k0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar)
+    bessel_k0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar)
+    bessel_k1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar)
+    bessel_k1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar)
+    bessel_j0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar)
+    bessel_y0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar)
+    bessel_j1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar)
+    bessel_y1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
+}
+
+}  // end namespace numext
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSEL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
new file mode 100644
index 000000000..943d10f6a
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0(const Packet& x) {
+  return numext::bessel_i0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0e(const Packet& x) {
+  return numext::bessel_i0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1(const Packet& x) {
+  return numext::bessel_i1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1e(const Packet& x) {
+  return numext::bessel_i1e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j0(const Packet& x) {
+  return numext::bessel_j0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j1(const Packet& x) {
+  return numext::bessel_j1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y0(const Packet& x) {
+  return numext::bessel_y0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y1(const Packet& x) {
+  return numext::bessel_y1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0(const Packet& x) {
+  return numext::bessel_k0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0e(const Packet& x) {
+  return numext::bessel_k0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1(const Packet& x) {
+  return numext::bessel_k1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1e(const Packet& x) {
+  return numext::bessel_k1e(x);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h b/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
new file mode 100644
index 000000000..d7b231adb
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
@@ -0,0 +1,67 @@
+#ifndef HIP_VECTOR_COMPATIBILITY_H
+#define HIP_VECTOR_COMPATIBILITY_H
+
+namespace hip_impl {
+  template <typename, typename, unsigned int> struct Scalar_accessor;
+}   // end namespace hip_impl
+
+namespace Eigen {
+namespace internal {
+
+#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>> : NAME <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_impl <hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl <T> {}; \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_retval <hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \
+template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>, mode> : NAME <T, mode> {};
+
+#if EIGEN_HAS_C99_MATH
+HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
+HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
+#endif
+
+HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
+HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // HIP_VECTOR_COMPATIBILITY_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
index ed415db99..691ff4d03 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -24,7 +24,7 @@ namespace Eigen {
   * \sa Eigen::igammac(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -33,6 +33,48 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   );
 }
 
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
+  *
+  * This function computes the coefficient-wise derivative of the incomplete
+  * gamma function with respect to the parameter a.
+  *
+  * \note This function supports only float and double scalar types in c++11
+  * mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations
+  * of igamma_der_a(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+    a.derived(),
+    x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
+  *
+  * This function computes the coefficient-wise derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+  *
+  * \note This function supports only float and double scalar types in c++11
+  * mode. To support other scalar types,
+  * or float/double in non c++11 mode, the user has to provide implementations
+  * of gamma_sample_der_alpha(T,T) for any scalar
+  * type T to be supported.
+  *
+  * \sa Eigen::igamma(), Eigen::lgamma()
+  */
+template <typename AlphaDerived, typename SampleDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
+      alpha.derived(),
+      sample.derived());
+}
+
 /** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
   *
   * This function computes the coefficient-wise complementary incomplete gamma function.
@@ -44,7 +86,7 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -66,7 +108,7 @@ igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDeriv
 // * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
 // * \sa ArrayBase::polygamma()
 template<typename DerivedN,typename DerivedX>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
 polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
@@ -86,7 +128,7 @@ polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>&
   * \sa Eigen::betainc(), Eigen::lgamma()
   */
 template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
 betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
 {
   return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
@@ -101,7 +143,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   *
   * It returns the Riemann zeta function of two arguments \a x and \a q:
   *
-  * \param x is the exposent, it must be > 1
+  * \param x is the exponent, it must be > 1
   * \param q is the shift, it must be > 0
   *
   * \note This function supports only float and double scalar types. To support other scalar types, the user has
@@ -110,7 +152,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   * \sa ArrayBase::zeta()
   */
 template<typename DerivedX,typename DerivedQ>
-inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
 zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
@@ -119,6 +161,7 @@ zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
   );
 }
 
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
new file mode 100644
index 000000000..2d94231f0
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) {
+  return Eigen::bfloat16(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha, const Eigen::bfloat16& sample) {
+  return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index d8f2363be..abefe99b7 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -41,6 +41,60 @@ struct functor_traits<scalar_igamma_op<Scalar> > {
   };
 };
 
+/** \internal
+  * \brief Template functor to compute the derivative of the incomplete gamma
+  * function igamma_der_a(a, x)
+  *
+  * \sa class CwiseBinaryOp, Cwise::igamma_der_a
+  */
+template <typename Scalar>
+struct scalar_igamma_der_a_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igamma_der_a;
+    return igamma_der_a(a, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma_der_a(a, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma
+    Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the derivative of the sample
+  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+  * gamma_sample_der_alpha(alpha, sample)
+  *
+  * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+  */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+    using numext::gamma_sample_der_alpha;
+    return gamma_sample_der_alpha(alpha, sample);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+    return internal::pgamma_sample_der_alpha(alpha, sample);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+    Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+  };
+};
 
 /** \internal
   * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
@@ -101,11 +155,11 @@ struct functor_traits<scalar_betainc_op<Scalar> > {
  */
 template<typename Scalar> struct scalar_lgamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::lgamma; return lgamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_lgamma_op<Scalar> >
@@ -123,11 +177,11 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_digamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::digamma; return digamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_digamma_op<Scalar> >
@@ -145,11 +199,11 @@ struct functor_traits<scalar_digamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_zeta_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
         using numext::zeta; return zeta(x, q);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_zeta_op<Scalar> >
@@ -167,11 +221,11 @@ struct functor_traits<scalar_zeta_op<Scalar> >
  */
 template<typename Scalar> struct scalar_polygamma_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
         using numext::polygamma; return polygamma(n, x);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_polygamma_op<Scalar> >
@@ -184,25 +238,40 @@ struct functor_traits<scalar_polygamma_op<Scalar> >
 };
 
 /** \internal
- * \brief Template functor to compute the Gauss error function of a
- * scalar
- * \sa class CwiseUnaryOp, Cwise::erf()
+ * \brief Template functor to compute the error function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::erf()
  */
 template<typename Scalar> struct scalar_erf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
-    using numext::erf; return erf(a);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& a) const {
+    return numext::erf(a);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return perf(x);
   }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
 };
-template<typename Scalar>
-struct functor_traits<scalar_erf_op<Scalar> >
-{
+template <typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> > {
   enum {
-    // Guesstimate
-    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasErf
+    PacketAccess = packet_traits<Scalar>::HasErf,
+    Cost =
+        (PacketAccess
+#ifdef EIGEN_VECTORIZE_FMA
+             // TODO(rmlarsen): Move the FMA cost model to a central location.
+             // Haswell can issue 2 add/mul/madd per cycle.
+             // 10 pmadd, 2 pmul, 1 div, 2 other
+             ? (2 * NumTraits<Scalar>::AddCost +
+                7 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+             ? (12 * NumTraits<Scalar>::AddCost +
+                12 * NumTraits<Scalar>::MulCost +
+                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+             // Assume for simplicity that this is as expensive as an exp().
+             : (functor_traits<scalar_exp_op<Scalar> >::Cost))
   };
 };
 
@@ -213,11 +282,11 @@ struct functor_traits<scalar_erf_op<Scalar> >
  */
 template<typename Scalar> struct scalar_erfc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
     using numext::erfc; return erfc(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erfc_op<Scalar> >
@@ -229,6 +298,31 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template<typename Scalar> struct scalar_ndtri_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+    using numext::ndtri; return ndtri(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> >
+{
+  enum {
+    // On average, We are evaluating rational functions with degree N=9 in the
+    // numerator and denominator. This results in 2*N additions and 2*N
+    // multiplications.
+    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNdtri
+  };
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index 553bcda6a..2a3a53168 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -30,9 +30,20 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
+  return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
 }
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index f524d7137..f1c260e29 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -36,66 +36,6 @@ namespace internal {
 //    Good luck with your project,
 //    Steve
 
-namespace cephes {
-
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Scalar, int N>
-struct polevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
-  }
-};
-
-template <typename Scalar>
-struct polevl<Scalar, 0> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
-    return coef[0];
-  }
-};
-
-}  // end namespace cephes
 
 /****************************************************************************
  * Implementation of lgamma, requires C++11/C99                             *
@@ -117,13 +57,27 @@ struct lgamma_retval {
 };
 
 #if EIGEN_HAS_C99_MATH
+// Since glibc 2.19
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 19) || __GLIBC__>2) \
+ && (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+// Glibc versions before 2.19
+#if defined(__GLIBC__) && ((__GLIBC__==2 && __GLIBC_MINOR__ < 19) || __GLIBC__<2) \
+ && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
 template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int signgam;
-    return ::lgammaf_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined (EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgammaf_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgammaf(x);
 #endif
@@ -134,14 +88,18 @@ template <>
 struct lgamma_impl<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int signgam;
-    return ::lgamma_r(x, &signgam);
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgamma_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
 #else
     return ::lgamma(x);
 #endif
   }
 };
+
+#undef EIGEN_HAS_LGAMMA_R
 #endif
 
 /****************************************************************************
@@ -191,7 +149,7 @@ struct digamma_impl_maybe_poly<float> {
     float z;
     if (s < 1.0e8f) {
       z = 1.0f / (s * s);
-      return z * cephes::polevl<float, 3>::run(z, A);
+      return z * internal::ppolevl<float, 3>::run(z, A);
     } else return 0.0f;
   }
 };
@@ -213,7 +171,7 @@ struct digamma_impl_maybe_poly<double> {
     double z;
     if (s < 1.0e17) {
       z = 1.0 / (s * s);
-      return z * cephes::polevl<double, 6>::run(z, A);
+      return z * internal::ppolevl<double, 6>::run(z, A);
     }
     else return 0.0;
   }
@@ -283,7 +241,7 @@ struct digamma_impl {
     Scalar p, q, nz, s, w, y;
     bool negative = false;
 
-    const Scalar maxnum = NumTraits<Scalar>::infinity();
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
     const Scalar m_pi = Scalar(EIGEN_PI);
 
     const Scalar zero = Scalar(0);
@@ -296,7 +254,7 @@ struct digamma_impl {
       q = x;
       p = numext::floor(q);
       if (p == q) {
-        return maxnum;
+        return nan;
       }
       /* Remove the zeros of tan(m_pi x)
        * by subtracting the nearest integer from x
@@ -335,13 +293,63 @@ struct digamma_impl {
  * Implementation of erf, requires C++11/C99                                *
  ****************************************************************************/
 
-template <typename Scalar>
+/** \internal \returns the error function of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 13/8-degree rational interpolant which
+    is accurate up to a couple of ulp in the range [-4, 4], outside of which
+    fl(erf(x)) = +/-1.
+
+    This implementation works on both scalars and Ts.
+*/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
+  // Clamp the inputs to the range [-4, 4] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_4 = pset1<T>(4.f);
+  const T minus_4 = pset1<T>(-4.f);
+  const T x = pmax(pmin(a_x, plus_4), minus_4);
+  // The monomial coefficients of the numerator polynomial (odd).
+  const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
+  const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
+  const T alpha_5 = pset1<T>(-7.34990630326855e-04f);
+  const T alpha_7 = pset1<T>(-5.69250639462346e-05f);
+  const T alpha_9 = pset1<T>(-2.10102402082508e-06f);
+  const T alpha_11 = pset1<T>(2.77068142495902e-08f);
+  const T alpha_13 = pset1<T>(-2.72614225801306e-10f);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const T beta_0 = pset1<T>(-1.42647390514189e-02f);
+  const T beta_2 = pset1<T>(-7.37332916720468e-03f);
+  const T beta_4 = pset1<T>(-1.68282697438203e-03f);
+  const T beta_6 = pset1<T>(-2.13374055278905e-04f);
+  const T beta_8 = pset1<T>(-1.45660718464996e-05f);
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  T p = pmadd(x2, alpha_13, alpha_11);
+  p = pmadd(x2, p, alpha_9);
+  p = pmadd(x2, p, alpha_7);
+  p = pmadd(x2, p, alpha_5);
+  p = pmadd(x2, p, alpha_3);
+  p = pmadd(x2, p, alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = pmadd(x2, beta_8, beta_6);
+  q = pmadd(x2, q, beta_4);
+  q = pmadd(x2, q, beta_2);
+  q = pmadd(x2, q, beta_0);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template <typename T>
 struct erf_impl {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return Scalar(0);
+  static EIGEN_STRONG_INLINE T run(const T& x) {
+    return generic_fast_erf_float(x);
   }
 };
 
@@ -354,13 +362,25 @@ struct erf_retval {
 template <>
 struct erf_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
+  static EIGEN_STRONG_INLINE float run(float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf_float(x);
+#endif
+  }
 };
 
 template <>
 struct erf_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
+  static EIGEN_STRONG_INLINE double run(double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return ::erf(x);
+#endif
+  }
 };
 #endif  // EIGEN_HAS_C99_MATH
 
@@ -387,16 +407,270 @@ struct erfc_retval {
 template <>
 struct erfc_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+  static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfcf(x);
+#endif
+  }
 };
 
 template <>
 struct erfc_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+  static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return ::erfc(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+
+/***************************************************************************
+* Implementation of ndtri.                                                 *
+****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) );  then the approximation is
+ * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2).  For larger arguments,
+ * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   domain        # trials      peak         rms
+ *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
+ *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
+ *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
+ *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition    value returned
+ * ndtri domain       x <= 0        -MAXNUM
+ * ndtri domain       x >= 1         MAXNUM
+ *
+ */
+ /*
+   Cephes Math Library Release 2.2: June, 1992
+   Copyright 1985, 1987, 1992 by Stephen L. Moshier
+   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+// TODO: Add a cheaper approximation for float.
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
+    const T& should_flipsign, const T& x) {
+  typedef typename unpacket_traits<T>::type Scalar;
+  const T sign_mask = pset1<T>(Scalar(-0.0));
+  T sign_bit = pand<T>(should_flipsign, sign_mask);
+  return pxor<T>(sign_bit, x);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
+    const double& should_flipsign, const double& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
+    const float& should_flipsign, const float& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+  const ScalarType p0[] = {
+    ScalarType(-5.99633501014107895267e1),
+    ScalarType(9.80010754185999661536e1),
+    ScalarType(-5.66762857469070293439e1),
+    ScalarType(1.39312609387279679503e1),
+    ScalarType(-1.23916583867381258016e0)
+  };
+  const ScalarType q0[] = {
+    ScalarType(1.0),
+    ScalarType(1.95448858338141759834e0),
+    ScalarType(4.67627912898881538453e0),
+    ScalarType(8.63602421390890590575e1),
+    ScalarType(-2.25462687854119370527e2),
+    ScalarType(2.00260212380060660359e2),
+    ScalarType(-8.20372256168333339912e1),
+    ScalarType(1.59056225126211695515e1),
+    ScalarType(-1.18331621121330003142e0)
+  };
+  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+  const T half = pset1<T>(ScalarType(0.5));
+  T c, c2, ndtri_gt_exp_neg_two;
+
+  c = psub(b, half);
+  c2 = pmul(c, c);
+  ndtri_gt_exp_neg_two = pmadd(c, pmul(
+      c2, pdiv(
+          internal::ppolevl<T, 4>::run(c2, p0),
+          internal::ppolevl<T, 8>::run(c2, q0))), c);
+  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
+    const T& b, const T& should_flipsign) {
+  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+   */
+  const ScalarType p1[] = {
+    ScalarType(4.05544892305962419923e0),
+    ScalarType(3.15251094599893866154e1),
+    ScalarType(5.71628192246421288162e1),
+    ScalarType(4.40805073893200834700e1),
+    ScalarType(1.46849561928858024014e1),
+    ScalarType(2.18663306850790267539e0),
+    ScalarType(-1.40256079171354495875e-1),
+    ScalarType(-3.50424626827848203418e-2),
+    ScalarType(-8.57456785154685413611e-4)
+  };
+  const ScalarType q1[] = {
+    ScalarType(1.0),
+    ScalarType(1.57799883256466749731e1),
+    ScalarType(4.53907635128879210584e1),
+    ScalarType(4.13172038254672030440e1),
+    ScalarType(1.50425385692907503408e1),
+    ScalarType(2.50464946208309415979e0),
+    ScalarType(-1.42182922854787788574e-1),
+    ScalarType(-3.80806407691578277194e-2),
+    ScalarType(-9.33259480895457427372e-4)
+  };
+  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+   */
+  const ScalarType p2[] = {
+    ScalarType(3.23774891776946035970e0),
+    ScalarType(6.91522889068984211695e0),
+    ScalarType(3.93881025292474443415e0),
+    ScalarType(1.33303460815807542389e0),
+    ScalarType(2.01485389549179081538e-1),
+    ScalarType(1.23716634817820021358e-2),
+    ScalarType(3.01581553508235416007e-4),
+    ScalarType(2.65806974686737550832e-6),
+    ScalarType(6.23974539184983293730e-9)
+  };
+  const ScalarType q2[] = {
+    ScalarType(1.0),
+    ScalarType(6.02427039364742014255e0),
+    ScalarType(3.67983563856160859403e0),
+    ScalarType(1.37702099489081330271e0),
+    ScalarType(2.16236993594496635890e-1),
+    ScalarType(1.34204006088543189037e-2),
+    ScalarType(3.28014464682127739104e-4),
+    ScalarType(2.89247864745380683936e-6),
+    ScalarType(6.79019408009981274425e-9)
+  };
+  const T eight = pset1<T>(ScalarType(8.0));
+  const T one = pset1<T>(ScalarType(1));
+  const T neg_two = pset1<T>(ScalarType(-2));
+  T x, x0, x1, z;
+
+  x = psqrt(pmul(neg_two, plog(b)));
+  x0 = psub(x, pdiv(plog(x), x));
+  z = pdiv(one, x);
+  x1 = pmul(
+      z, pselect(
+          pcmp_lt(x, eight),
+          pdiv(internal::ppolevl<T, 8>::run(z, p1),
+               internal::ppolevl<T, 8>::run(z, q1)),
+          pdiv(internal::ppolevl<T, 8>::run(z, p2),
+               internal::ppolevl<T, 8>::run(z, q2))));
+  return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T generic_ndtri(const T& a) {
+  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+  const T zero = pset1<T>(ScalarType(0));
+  const T one = pset1<T>(ScalarType(1));
+  // exp(-2)
+  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+  T b, ndtri, should_flipsign;
+
+  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+  b = pselect(should_flipsign, a, psub(one, a));
+
+  ndtri = pselect(
+      pcmp_lt(exp_neg_two, b),
+      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+  return pselect(
+      pcmp_le(a, zero), neg_maxnum,
+      pselect(pcmp_le(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+# else
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
+    return generic_ndtri<Scalar, Scalar>(x);
+  }
 };
+
 #endif  // EIGEN_HAS_C99_MATH
 
+
 /**************************************************************************************************************
  * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
  **************************************************************************************************************/
@@ -452,6 +726,228 @@ struct cephes_helper<double> {
   }
 };
 
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC
+static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
+        // Assuming x and a aren't Nan.
+        (numext::isnan)(logax)) {
+      return Scalar(0);
+    }
+    return numext::exp(logax);
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC
+int igamma_num_iterations() {
+  /* Returns the maximum number of internal iterations for igamma computation.
+   */
+  if (mode == VALUE) {
+    return 2000;
+  }
+
+  if (internal::is_same<Scalar, float>::value) {
+    return 200;
+  } else if (internal::is_same<Scalar, double>::value) {
+    return 500;
+  } else {
+    return 2000;
+  }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+  /* Computes igamc(a, x) or derivative (depending on the mode)
+   * using the continued fraction expansion of the complementary
+   * incomplete Gamma function.
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    if ((numext::isinf)(x)) {
+      return zero;
+    }
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    // continued fraction
+    Scalar y = one - a;
+    Scalar z = x + y + one;
+    Scalar c = zero;
+    Scalar pkm2 = one;
+    Scalar qkm2 = x;
+    Scalar pkm1 = x + one;
+    Scalar qkm1 = z * x;
+    Scalar ans = pkm1 / qkm1;
+
+    Scalar dpkm2_da = zero;
+    Scalar dqkm2_da = zero;
+    Scalar dpkm1_da = zero;
+    Scalar dqkm1_da = -x;
+    Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      c += one;
+      y += one;
+      z += two;
+
+      Scalar yc = y * c;
+      Scalar pk = pkm1 * z - pkm2 * yc;
+      Scalar qk = qkm1 * z - qkm2 * yc;
+
+      Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+      Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+      if (qk != zero) {
+        Scalar ans_prev = ans;
+        ans = pk / qk;
+
+        Scalar dans_da_prev = dans_da;
+        dans_da = (dpk_da - ans * dqk_da) / qk;
+
+        if (mode == VALUE) {
+          if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+            break;
+          }
+        } else {
+          if (numext::abs(dans_da - dans_da_prev) <= machep) {
+            break;
+          }
+        }
+      }
+
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      dpkm2_da = dpkm1_da;
+      dpkm1_da = dpk_da;
+      dqkm2_da = dqkm1_da;
+      dqkm1_da = dqk_da;
+
+      if (numext::abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+
+        dpkm2_da *= biginv;
+        dpkm1_da *= biginv;
+        dqkm2_da *= biginv;
+        dqkm1_da *= biginv;
+      }
+    }
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x;
+    }
+  }
+};
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+  /* Computes igam(a, x) or its derivative (depending on the mode)
+   * using the series expansion of the incomplete Gamma function.
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC
+  static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evalutes to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    ax /= a;
+
+    /* power series */
+    Scalar r = a;
+    Scalar c = one;
+    Scalar ans = one;
+
+    Scalar dc_da = zero;
+    Scalar dans_da = zero;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      r += one;
+      Scalar term = x / r;
+      Scalar dterm_da = -x / (r * r);
+      dc_da = term * dc_da + dterm_da * c;
+      dans_da += dc_da;
+      c *= term;
+      ans += c;
+
+      if (mode == VALUE) {
+        if (c <= machep * ans) {
+          break;
+        }
+      } else {
+        if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+          break;
+        }
+      }
+    }
+
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default: // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x / a;
+    }
+  }
+};
+
 #if !EIGEN_HAS_C99_MATH
 
 template <typename Scalar>
@@ -466,8 +962,6 @@ struct igammac_impl {
 
 #else
 
-template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
-
 template <typename Scalar>
 struct igammac_impl {
   EIGEN_DEVICE_FUNC
@@ -535,93 +1029,15 @@ struct igammac_impl {
       return nan;
     }
 
-    if ((x < one) || (x < a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igamma_impl::Impl(), so call it, rather than igamma_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igamma_impl<Scalar>::Impl(a, x));
-    }
-
-    return Impl(a, x);
-  }
-
- private:
-  /* igamma_impl calls igammac_impl::Impl. */
-  friend struct igamma_impl<Scalar>;
-
-  /* Actually computes igamc(a, x).
-   *
-   * Preconditions:
-   *   a > 0
-   *   x >= 1
-   *   x >= a
-   */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar two = 2;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
-    const Scalar big = cephes_helper<Scalar>::big();
-    const Scalar biginv = cephes_helper<Scalar>::biginv();
-    const Scalar inf = NumTraits<Scalar>::infinity();
-
-    Scalar ans, ax, c, yc, r, t, y, z;
-    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
-
-    if (x == inf) return zero;  // std::isinf crashes on CUDA
-
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {  // underflow
-      return zero;
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
     }
-    ax = numext::exp(ax);
 
-    // continued fraction
-    y = one - a;
-    z = x + y + one;
-    c = zero;
-    pkm2 = one;
-    qkm2 = x;
-    pkm1 = x + one;
-    qkm1 = z * x;
-    ans = pkm1 / qkm1;
-
-    while (true) {
-      c += one;
-      y += one;
-      z += two;
-      yc = y * c;
-      pk = pkm1 * z - pkm2 * yc;
-      qk = qkm1 * z - qkm2 * yc;
-      if (qk != zero) {
-        r = pk / qk;
-        t = numext::abs((ans - r) / r);
-        ans = r;
-      } else {
-        t = one;
-      }
-      pkm2 = pkm1;
-      pkm1 = pk;
-      qkm2 = qkm1;
-      qkm1 = qk;
-      if (numext::abs(pk) > big) {
-        pkm2 *= biginv;
-        pkm1 *= biginv;
-        qkm2 *= biginv;
-        qkm1 *= biginv;
-      }
-      if (t <= machep) {
-        break;
-      }
+    if ((x < one) || (x < a)) {
+      return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
     }
 
-    return (ans * ax);
+    return igammac_cf_impl<Scalar, VALUE>::run(a, x);
   }
 };
 
@@ -631,15 +1047,10 @@ struct igammac_impl {
  * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
  ************************************************************************************************/
 
-template <typename Scalar>
-struct igamma_retval {
-  typedef Scalar type;
-};
-
 #if !EIGEN_HAS_C99_MATH
 
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
@@ -650,69 +1061,17 @@ struct igamma_impl {
 
 #else
 
-template <typename Scalar>
-struct igamma_impl {
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
   EIGEN_DEVICE_FUNC
   static Scalar run(Scalar a, Scalar x) {
-    /*	igam()
-     *	Incomplete gamma integral
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double a, x, y, igam();
-     *
-     * y = igam( a, x );
-     *
-     * DESCRIPTION:
-     *
-     * The function is defined by
-     *
-     *                           x
-     *                            -
-     *                   1       | |  -t  a-1
-     *  igam(a,x)  =   -----     |   e   t   dt.
-     *                  -      | |
-     *                 | (a)    -
-     *                           0
-     *
-     *
-     * In this implementation both arguments must be positive.
-     * The integral is evaluated by either a power series or
-     * continued fraction expansion, depending on the relative
-     * values of a and x.
-     *
-     * ACCURACY (double):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30       200000       3.6e-14     2.9e-15
-     *    IEEE      0,100      300000       9.9e-14     1.5e-14
-     *
-     *
-     * ACCURACY (float):
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30        20000       7.8e-6      5.9e-7
-     *
-     */
-    /*
-      Cephes Math Library Release 2.2: June, 1992
-      Copyright 1985, 1987, 1992 by Stephen L. Moshier
-      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
-    */
-
-
-    /* left tail of incomplete gamma function:
-     *
-     *          inf.      k
-     *   a  -x   -       x
-     *  x  e     >   ----------
-     *           -     -
-     *          k=0   | (a+k+1)
+    /* Depending on the mode, returns
+     * - VALUE: incomplete Gamma function igamma(a, x)
+     * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+     * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+     * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
      *
+     * Derivatives are implemented by forward-mode differentiation.
      */
     const Scalar zero = 0;
     const Scalar one = 1;
@@ -724,67 +1083,167 @@ struct igamma_impl {
       return nan;
     }
 
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
     if ((x > one) && (x > a)) {
-      /* The checks above ensure that we meet the preconditions for
-       * igammac_impl::Impl(), so call it, rather than igammac_impl::Run().
-       * Calling Run() would also work, but in that case the compiler may not be
-       * able to prove that igammac_impl::Run and igamma_impl::Run are not
-       * mutually recursive.  This leads to worse code, particularly on
-       * platforms like nvptx, where recursion is allowed only begrudgingly.
-       */
-      return (one - igammac_impl<Scalar>::Impl(a, x));
+      Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+      if (mode == VALUE) {
+        return one - ret;
+      } else {
+        return -ret;
+      }
     }
 
-    return Impl(a, x);
+    return igamma_series_impl<Scalar, mode>::run(a, x);
   }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
 
- private:
-  /* igammac_impl calls igamma_impl::Impl. */
-  friend struct igammac_impl<Scalar>;
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
 
-  /* Actually computes igam(a, x).
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+  /* igam()
+   * Incomplete gamma integral.
+   *
+   * The CDF of Gamma(a, 1) random variable at the point x.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 1.26713e-05
+   * double: 2.33606e-12
+   *
+   * Cephes documentation below.
+   *
+   * SYNOPSIS:
+   *
+   * double a, x, y, igam();
+   *
+   * y = igam( a, x );
+   *
+   * DESCRIPTION:
+   *
+   * The function is defined by
+   *
+   *                           x
+   *                            -
+   *                   1       | |  -t  a-1
+   *  igam(a,x)  =   -----     |   e   t   dt.
+   *                  -      | |
+   *                 | (a)    -
+   *                           0
+   *
+   *
+   * In this implementation both arguments must be positive.
+   * The integral is evaluated by either a power series or
+   * continued fraction expansion, depending on the relative
+   * values of a and x.
+   *
+   * ACCURACY (double):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30       200000       3.6e-14     2.9e-15
+   *    IEEE      0,100      300000       9.9e-14     1.5e-14
+   *
+   *
+   * ACCURACY (float):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30        20000       7.8e-6      5.9e-7
    *
-   * Preconditions:
-   *   x > 0
-   *   a > 0
-   *   !(x > 1 && x > a)
    */
-  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+  /*
+    Cephes Math Library Release 2.2: June, 1992
+    Copyright 1985, 1987, 1992 by Stephen L. Moshier
+    Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+  */
 
-    Scalar ans, ax, c, r;
+  /* left tail of incomplete gamma function:
+   *
+   *          inf.      k
+   *   a  -x   -       x
+   *  x  e     >   ----------
+   *           -     -
+   *          k=0   | (a+k+1)
+   *
+   */
+};
 
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (ax < -maxlog) {
-      // underflow
-      return zero;
-    }
-    ax = numext::exp(ax);
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
 
-    /* power series */
-    r = a;
-    c = one;
-    ans = one;
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+  /* Derivative of the incomplete Gamma function with respect to a.
+   *
+   * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 6.17992e-07
+   * double: 4.60453e-12
+   *
+   * Reference:
+   * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+   * integral". Journal of the Royal Statistical Society. 1982
+   */
+};
 
-    while (true) {
-      r += one;
-      c *= x/r;
-      ans += c;
-      if (c/ans <= machep) {
-        break;
-      }
-    }
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
 
-    return (ans * ax / a);
-  }
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl
+    : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+  /* Derivative of a Gamma random variable sample with respect to alpha.
+   *
+   * Consider a sample of a Gamma random variable with the concentration
+   * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+   * derivative that we want to compute is dsample / dalpha =
+   * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+   * However, this formula is numerically unstable and expensive, so instead
+   * we use implicit differentiation:
+   *
+   * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+   * Apply d / dalpha to both sides:
+   * d igamma(alpha, sample) / dalpha
+   *     + d igamma(alpha, sample) / dsample * dsample/dalpha  = 0
+   * d igamma(alpha, sample) / dalpha
+   *     + Gamma(sample | alpha, 1) dsample / dalpha = 0
+   * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+   *                   / Gamma(sample | alpha, 1)
+   *
+   * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+   * (note that the derivative of the CDF w.r.t. sample is the PDF).
+   * See the reference below for more details.
+   *
+   * The derivative of igamma(alpha, sample) is computed by forward
+   * differentiation of the igamma code. Division by the Gamma PDF is performed
+   * in the same code, increasing the accuracy and speed due to cancellation
+   * of some terms.
+   *
+   * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+   * points. The ground truth is computed by mpmath. Mean absolute error:
+   * float: 2.1686e-06
+   * double: 1.4774e-12
+   *
+   * Reference:
+   * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+   * 2018
+   */
 };
 
-#endif  // EIGEN_HAS_C99_MATH
-
 /*****************************************************************************
  * Implementation of Riemann zeta function of two arguments, based on Cephes *
  *****************************************************************************/
@@ -944,7 +1403,12 @@ struct zeta_impl {
         {
             if(q == numext::floor(q))
             {
-                return maxnum;
+                if (x == numext::floor(x) && long(x) % 2 == 0) {
+                    return maxnum;
+                }
+                else {
+                    return nan;
+                }
             }
             p = x;
             r = numext::floor(p);
@@ -1020,11 +1484,11 @@ struct polygamma_impl {
         Scalar nplus = n + one;
         const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
-        // Check that n is an integer
-        if (numext::floor(n) != n) {
+        // Check that n is a non-negative integer
+        if (numext::floor(n) != n || n < zero) {
             return nan;
         }
-        // Just return the digamma function for n = 1
+        // Just return the digamma function for n = 0
         else if (n == zero) {
             return digamma_impl<Scalar>::run(x);
         }
@@ -1392,7 +1856,7 @@ struct betainc_helper<double> {
     if ((a + b) < maxgam && numext::abs(u) < maxlog) {
       t = gamma(a + b) / (gamma(a) * gamma(b));
       s = s * t * pow(x, a);
-    } else {
+    }
     */
     t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
         lgamma_impl<double>::run(b) + u + numext::log(s);
@@ -1540,12 +2004,30 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
 }
 
 template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
+    ndtri(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
+template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
     igamma(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
 }
 
 template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
+    igamma_der_a(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+    gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
     igammac(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
@@ -1558,8 +2040,6 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
 }
 
 }  // end namespace numext
-
-
 }  // end namespace Eigen
 
 #endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index 46d60d323..2bb017921 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -38,10 +38,32 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pndtri(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
+}
+
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
 
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+  using numext::igamma_der_a; return igamma_der_a(a, x);
+}
+
+/** \internal \returns compute the derivative of the sample
+  * of Gamma(alpha, 1) random variable with respect to the parameter a
+  * gamma_sample_der_alpha(\a alpha, \a sample) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+  using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
+}
+
 /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
@@ -55,4 +77,3 @@ Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
-
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 000000000..2d7669209
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 000000000..35e62a8ac
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 000000000..7dd3c3e5b
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 000000000..79878f2b6
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_SPECIAL_FUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
deleted file mode 100644
index ec4fa8448..000000000
--- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ /dev/null
@@ -1,165 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
-#define EIGEN_CUDA_SPECIALFUNCTIONS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
-  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
-  using numext::lgamma;
-  return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pdigamma<float4>(const float4& a)
-{
-  using numext::digamma;
-  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pdigamma<double2>(const double2& a)
-{
-  using numext::digamma;
-  return make_double2(digamma(a.x), digamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pzeta<float4>(const float4& x, const float4& q)
-{
-    using numext::zeta;
-    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pzeta<double2>(const double2& x, const double2& q)
-{
-    using numext::zeta;
-    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 ppolygamma<float4>(const float4& n, const float4& x)
-{
-    using numext::polygamma;
-    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 ppolygamma<double2>(const double2& n, const double2& x)
-{
-    using numext::polygamma;
-    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
-  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
-  using numext::erf;
-  return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
-  using numext::erfc;
-  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
-  using numext::erfc;
-  return make_double2(erfc(a.x), erfc(a.y));
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigamma<float4>(const float4& a, const float4& x)
-{
-  using numext::igamma;
-  return make_float4(
-      igamma(a.x, x.x),
-      igamma(a.y, x.y),
-      igamma(a.z, x.z),
-      igamma(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigamma<double2>(const double2& a, const double2& x)
-{
-  using numext::igamma;
-  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigammac<float4>(const float4& a, const float4& x)
-{
-  using numext::igammac;
-  return make_float4(
-      igammac(a.x, x.x),
-      igammac(a.y, x.y),
-      igammac(a.z, x.z),
-      igammac(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double2& x)
-{
-  using numext::igammac;
-  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
-{
-  using numext::betainc;
-  return make_float4(
-      betainc(a.x, b.x, x.x),
-      betainc(a.y, b.y, x.y),
-      betainc(a.z, b.z, x.z),
-      betainc(a.w, b.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
-{
-  using numext::betainc;
-  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
new file mode 100644
index 000000000..dd3bf4dd1
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
@@ -0,0 +1,369 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pndtri<float4>(const float4& a)
+{
+  using numext::ndtri;
+  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pndtri<double2>(const double2& a)
+{
+  using numext::ndtri;
+  return make_double2(ndtri(a.x), ndtri(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
+    const float4& a, const float4& x) {
+  using numext::igamma_der_a;
+  return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
+                     igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pigamma_der_a<double2>(const double2& a, const double2& x) {
+  using numext::igamma_der_a;
+  return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
+    const float4& alpha, const float4& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_float4(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y),
+      gamma_sample_der_alpha(alpha.z, sample.z),
+      gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_double2(
+      gamma_sample_der_alpha(alpha.x, sample.x),
+      gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
+  using numext::bessel_i0e;
+  return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0e<double2>(const double2& x) {
+  using numext::bessel_i0e;
+  return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
+  using numext::bessel_i0;
+  return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0<double2>(const double2& x) {
+  using numext::bessel_i0;
+  return make_double2(bessel_i0(x.x), bessel_i0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
+  using numext::bessel_i1e;
+  return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1e<double2>(const double2& x) {
+  using numext::bessel_i1e;
+  return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
+  using numext::bessel_i1;
+  return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1<double2>(const double2& x) {
+  using numext::bessel_i1;
+  return make_double2(bessel_i1(x.x), bessel_i1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
+  using numext::bessel_k0e;
+  return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0e<double2>(const double2& x) {
+  using numext::bessel_k0e;
+  return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
+  using numext::bessel_k0;
+  return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0<double2>(const double2& x) {
+  using numext::bessel_k0;
+  return make_double2(bessel_k0(x.x), bessel_k0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
+  using numext::bessel_k1e;
+  return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1e<double2>(const double2& x) {
+  using numext::bessel_k1e;
+  return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
+  using numext::bessel_k1;
+  return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1<double2>(const double2& x) {
+  using numext::bessel_k1;
+  return make_double2(bessel_k1(x.x), bessel_k1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
+  using numext::bessel_j0;
+  return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j0<double2>(const double2& x) {
+  using numext::bessel_j0;
+  return make_double2(bessel_j0(x.x), bessel_j0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
+  using numext::bessel_j1;
+  return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j1<double2>(const double2& x) {
+  using numext::bessel_j1;
+  return make_double2(bessel_j1(x.x), bessel_j1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
+  using numext::bessel_y0;
+  return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y0<double2>(const double2& x) {
+  using numext::bessel_y0;
+  return make_double2(bessel_y0(x.x), bessel_y0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
+  using numext::bessel_y1;
+  return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y1<double2>(const double2& x) {
+  using numext::bessel_y1;
+  return make_double2(bessel_y1(x.x), bessel_y1(x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 000000000..67433b057
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_BESSELFUNCTIONS_H
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 000000000..ec9295197
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                            \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) {                       \
+  const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));  \
+  const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));              \
+}                                                                       \
+                                                                        \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                       \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) {                       \
+  return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));               \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_SPECIALFUNCTIONS_H
diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h
index 627f6e482..79edd52ce 100644
--- a/unsupported/Eigen/src/Splines/Spline.h
+++ b/unsupported/Eigen/src/Splines/Spline.h
@@ -191,7 +191,7 @@ namespace Eigen
     DenseIndex span(Scalar u) const;
 
     /**
-     * \brief Computes the spang within the provided knot vector in which u is falling.
+     * \brief Computes the span within the provided knot vector in which u is falling.
      **/
     static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree, const typename SplineTraits<Spline>::KnotVectorType& knots);
     
@@ -249,15 +249,13 @@ namespace Eigen
     DenseIndex degree,
     const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
   {
-    typedef typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType BasisVectorType;
-
     const DenseIndex p = degree;
     const DenseIndex i = Spline::Span(u, degree, knots);
 
     const KnotVectorType& U = knots;
 
     BasisVectorType left(p+1); left(0) = Scalar(0);
-    BasisVectorType right(p+1); right(0) = Scalar(0);        
+    BasisVectorType right(p+1); right(0) = Scalar(0);
 
     VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
     VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
@@ -380,9 +378,6 @@ namespace Eigen
     typedef Spline<_Scalar, _Dim, _Degree> SplineType;
     enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
 
-    typedef typename SplineTraits<SplineType>::Scalar Scalar;
-    typedef typename SplineTraits<SplineType>::BasisVectorType BasisVectorType;
-  
     const DenseIndex span = SplineType::Span(u, p, U);
 
     const DenseIndex n = (std::min)(p, order);
diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index c761a9b3d..9f6e8afa0 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -17,8 +17,8 @@
 
 #include "SplineFwd.h"
 
-#include <Eigen/LU>
-#include <Eigen/QR>
+#include "../../../../Eigen/LU"
+#include "../../../../Eigen/QR"
 
 namespace Eigen
 {
@@ -181,7 +181,7 @@ namespace Eigen
    * \ingroup Splines_Module
    *
    * \param[in] pts The data points to which a spline should be fit.
-   * \param[out] chord_lengths The resulting chord lenggth vector.
+   * \param[out] chord_lengths The resulting chord length vector.
    *
    * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
    **/   
@@ -385,7 +385,7 @@ namespace Eigen
     {
       const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
 
-      if (derivativeIndices[derivativeIndex] == i)
+      if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i)
       {
         A.block(row, span - degree, 2, degree + 1)
           = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
@@ -395,8 +395,9 @@ namespace Eigen
       }
       else
       {
-        A.row(row++).segment(span - degree, degree + 1)
+        A.row(row).segment(span - degree, degree + 1)
           = SplineType::BasisFunctions(parameters[i], degree, knots);
+        b.col(row++) = points.col(i);
       }
     }
     b.col(0) = points.col(0);
diff --git a/unsupported/Eigen/src/Splines/SplineFwd.h b/unsupported/Eigen/src/Splines/SplineFwd.h
index 0a95fbf3e..00d6b4921 100644
--- a/unsupported/Eigen/src/Splines/SplineFwd.h
+++ b/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SPLINES_FWD_H
 #define EIGEN_SPLINES_FWD_H
 
-#include <Eigen/Core>
+#include "../../../../Eigen/Core"
 
 namespace Eigen
 {
diff --git a/unsupported/README.txt b/unsupported/README.txt
index 83479ff0b..70793bf13 100644
--- a/unsupported/README.txt
+++ b/unsupported/README.txt
@@ -20,7 +20,7 @@ However, it:
  - must rely on Eigen,
  - must be highly related to math,
  - should have some general purpose in the sense that it could
-   potentially become an offical Eigen module (or be merged into another one).
+   potentially become an official Eigen module (or be merged into another one).
 
 In doubt feel free to contact us. For instance, if your addons is very too specific
 but it shows an interesting way of using Eigen, then it could be a nice demo.
diff --git a/unsupported/bench/bench_svd.cpp b/unsupported/bench/bench_svd.cpp
index 01d8231ae..e7028a2b9 100644
--- a/unsupported/bench/bench_svd.cpp
+++ b/unsupported/bench/bench_svd.cpp
@@ -70,7 +70,7 @@ void bench_svd(const MatrixType& a = MatrixType())
   std::cout<< std::endl;
   timerJacobi.reset();
   timerBDC.reset();
-  cout << " Computes rotaion matrix" <<endl;
+  cout << " Computes rotation matrix" <<endl;
   for (int k=1; k<=NUMBER_SAMPLE; ++k)
   {
     timerBDC.start();
diff --git a/unsupported/doc/Overview.dox b/unsupported/doc/Overview.dox
index 45464a545..bae51dcf6 100644
--- a/unsupported/doc/Overview.dox
+++ b/unsupported/doc/Overview.dox
@@ -11,6 +11,8 @@ Click on the \e Modules tab at the top of this page to get a list of all unsuppo
 
 Don't miss the <a href="../index.html">official Eigen documentation</a>.
 
+ \subpage SYCL_EIGEN "SYCL backend for Eigen"
+
 */
 
 /*
@@ -26,3 +28,4 @@ subject to be included in %Eigen in the future.
 /// \internal \brief Namespace containing low-level routines from the %Eigen library.
 namespace internal {}
 }
+
diff --git a/unsupported/doc/SYCL.dox b/unsupported/doc/SYCL.dox
new file mode 100644
index 000000000..2295adf21
--- /dev/null
+++ b/unsupported/doc/SYCL.dox
@@ -0,0 +1,9 @@
+/** \page SYCL_EIGEN Eigen SYCL Backend
+
+Useful information for Eigen SYCL Backend:
+
+- <a href="https://developer.codeplay.com/computecppce/latest/getting-started-with-eigen">Getting Started with Eigen</a> 
+
+- <a href="https://developer.codeplay.com/computecppce/latest/options-for-building-eigen-sycl">Options for Building Eigen SYCL</a>  
+
+*/
diff --git a/unsupported/doc/examples/CMakeLists.txt b/unsupported/doc/examples/CMakeLists.txt
index c47646dfc..7bb67736c 100644
--- a/unsupported/doc/examples/CMakeLists.txt
+++ b/unsupported/doc/examples/CMakeLists.txt
@@ -1,20 +1,24 @@
-FILE(GLOB examples_SRCS "*.cpp")
+file(GLOB examples_SRCS "*.cpp")
 
-ADD_CUSTOM_TARGET(unsupported_examples)
+add_custom_target(unsupported_examples)
 
-INCLUDE_DIRECTORIES(../../../unsupported ../../../unsupported/test)
+include_directories(../../../unsupported ../../../unsupported/test)
 
-FOREACH(example_src ${examples_SRCS})
-  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
-  ADD_EXECUTABLE(example_${example} ${example_src})
+foreach(example_src ${examples_SRCS})
+  get_filename_component(example ${example_src} NAME_WE)
+  add_executable(example_${example} ${example_src})
   if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
     target_link_libraries(example_${example} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
   endif()
-  ADD_CUSTOM_COMMAND(
+  add_custom_command(
     TARGET example_${example}
     POST_BUILD
     COMMAND example_${example}
     ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
   )
-  ADD_DEPENDENCIES(unsupported_examples example_${example})
-ENDFOREACH(example_src)
+  add_dependencies(unsupported_examples example_${example})
+endforeach(example_src)
+
+if(EIGEN_TEST_SYCL)
+  add_subdirectory(SYCL)
+endif(EIGEN_TEST_SYCL)
diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp
index 1ef6aee18..3f8ca8c17 100644
--- a/unsupported/doc/examples/EulerAngles.cpp
+++ b/unsupported/doc/examples/EulerAngles.cpp
@@ -23,7 +23,7 @@ int main()
   // Some Euler angles representation that our plane use.
   EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
   
-  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  MyArmyAngles planeAnglesInMyArmyAngles(planeAngles);
   
   std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
   std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
@@ -37,7 +37,7 @@ int main()
   Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
   
   planeAngles = planeRotated;
-  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  planeAnglesInMyArmyAngles = planeRotated;
   
   std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
   std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp
index fcbf81276..85e8a0241 100644
--- a/unsupported/doc/examples/FFT.cpp
+++ b/unsupported/doc/examples/FFT.cpp
@@ -61,14 +61,14 @@ template <typename T>
 void RandomFill(std::vector<T> & vec)
 {
     for (size_t k=0;k<vec.size();++k)
-        vec[k] = T( rand() )/T(RAND_MAX) - .5;
+        vec[k] = T( rand() )/T(RAND_MAX) - T(.5);
 }
 
 template <typename T>
 void RandomFill(std::vector<std::complex<T> > & vec)
 {
     for (size_t k=0;k<vec.size();++k)
-        vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5);
+        vec[k] = std::complex<T> ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5));
 }
 
 template <typename T_time,typename T_freq>
@@ -85,7 +85,7 @@ void fwd_inv(size_t nfft)
     vector<T_time> timebuf2;
     fft.inv(timebuf2,freqbuf);
 
-    long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
+    T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf);
     cout << "roundtrip rmse: " << rmse << endl;
 }
 
diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt
new file mode 100644
index 000000000..1d0f721dc
--- /dev/null
+++ b/unsupported/doc/examples/SYCL/CMakeLists.txt
@@ -0,0 +1,37 @@
+FILE(GLOB examples_SRCS "*.cpp")
+
+set(EIGEN_SYCL ON)
+list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
+if(EIGEN_SYCL_TRISYCL)
+  set(CMAKE_CXX_STANDARD 17)
+else(EIGEN_SYCL_TRISYCL)
+  if(MSVC)
+    # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+    # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+    set(CMAKE_CXX_STANDARD 14)
+    list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+  else()
+    set(CMAKE_CXX_STANDARD 11)
+    list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+  endif()
+  # The following flags are not supported by Clang and can cause warnings
+  # if used with -Werror so they are removed here.
+  if(COMPUTECPP_USE_COMPILER_DRIVER)
+    set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+    string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+    string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  endif()
+  list(APPEND COMPUTECPP_USER_FLAGS
+      -DEIGEN_NO_ASSERTION_CHECKING=1
+      -no-serial-memop
+      -Xclang
+      -cl-mad-enable)
+endif(EIGEN_SYCL_TRISYCL)
+
+FOREACH(example_src ${examples_SRCS})
+  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
+  ei_add_test_internal(${example} example_${example})
+  ADD_DEPENDENCIES(unsupported_examples example_${example})
+ENDFOREACH(example_src)
+set(EIGEN_SYCL OFF)
diff --git a/unsupported/doc/examples/SYCL/CwiseMul.cpp b/unsupported/doc/examples/SYCL/CwiseMul.cpp
new file mode 100644
index 000000000..a7c33140e
--- /dev/null
+++ b/unsupported/doc/examples/SYCL/CwiseMul.cpp
@@ -0,0 +1,63 @@
+#include <iostream>
+#define EIGEN_USE_SYCL
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+int main()
+{
+  using DataType = float;
+  using IndexType = int64_t;
+  constexpr auto DataLayout = Eigen::RowMajor;
+
+  auto devices = Eigen::get_sycl_supported_devices();
+  const auto device_selector = *devices.begin();
+  Eigen::QueueInterface queueInterface(device_selector);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  
+  // create the tensors to be used in the operation
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 3;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  // initialize the tensors with the data we want manipulate to
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
+
+  // set up some random data in the tensors to be multiplied
+  in1 = in1.random();
+  in2 = in2.random();
+
+  // allocate memory for the tensors
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  // 
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+
+  // copy the memory to the device and do the c=a*b calculation
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  // print out the results
+   for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        std::cout << "device_out" << "(" << i << ", " << j << ", " << k << ") : " << out(i,j,k) 
+                  << " vs host_out" << "(" << i << ", " << j << ", " << k << ") : " << in1(i,j,k) * in2(i,j,k) << "\n";
+      }
+    }
+  }
+  printf("c=a*b Done\n");
+}
diff --git a/unsupported/doc/snippets/CMakeLists.txt b/unsupported/doc/snippets/CMakeLists.txt
index f0c5cc2a8..adf95a8db 100644
--- a/unsupported/doc/snippets/CMakeLists.txt
+++ b/unsupported/doc/snippets/CMakeLists.txt
@@ -1,26 +1,26 @@
-FILE(GLOB snippets_SRCS "*.cpp")
+file(GLOB snippets_SRCS "*.cpp")
 
-ADD_CUSTOM_TARGET(unsupported_snippets)
+add_custom_target(unsupported_snippets)
 
-FOREACH(snippet_src ${snippets_SRCS})
-  GET_FILENAME_COMPONENT(snippet ${snippet_src} NAME_WE)
-  SET(compile_snippet_target compile_${snippet})
-  SET(compile_snippet_src ${compile_snippet_target}.cpp)
-  FILE(READ ${snippet_src} snippet_source_code)
-  CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
+foreach(snippet_src ${snippets_SRCS})
+  get_filename_component(snippet ${snippet_src} NAME_WE)
+  set(compile_snippet_target compile_${snippet})
+  set(compile_snippet_src ${compile_snippet_target}.cpp)
+  file(READ ${snippet_src} snippet_source_code)
+  configure_file(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
-  ADD_EXECUTABLE(${compile_snippet_target}
+  add_executable(${compile_snippet_target}
                  ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
   if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
     target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
   endif()
-  ADD_CUSTOM_COMMAND(
+  add_custom_command(
     TARGET ${compile_snippet_target}
     POST_BUILD
     COMMAND ${compile_snippet_target}
     ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
   )
-  ADD_DEPENDENCIES(unsupported_snippets ${compile_snippet_target})
+  add_dependencies(unsupported_snippets ${compile_snippet_target})
   set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
                               PROPERTIES OBJECT_DEPENDS ${snippet_src})
-ENDFOREACH(snippet_src)
+endforeach(snippet_src)
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
 };
 
 
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
 {
   for(int i = 0; i < g_repeat; i++) {
 #ifdef EIGEN_TEST_PART_1
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index b5fa1c845..d30fa62bd 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -22,22 +13,27 @@ include_directories(../../test ../../unsupported ../../Eigen
 find_package (Threads)
 
 find_package(GoogleHash)
-if(GOOGLEHASH_FOUND)
+if(GoogleHash_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
   include_directories(${GOOGLEHASH_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS  "GoogleHash, ")
-else(GOOGLEHASH_FOUND)
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS  "GoogleHash, ")
-endif(GOOGLEHASH_FOUND)
+endif()
+
 
 find_package(Adolc)
-if(ADOLC_FOUND)
+if(Adolc_FOUND)
   include_directories(${ADOLC_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
-  ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
-else(ADOLC_FOUND)
+  if(EIGEN_TEST_CXX11)
+    ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+  else()
+    message(STATUS "Adolc found, but tests require C++11 mode")
+  endif()
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
-endif(ADOLC_FOUND)
+endif()
 
 # this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
 # see thread: "non-linear optimization test summary"
@@ -47,9 +43,7 @@ ei_add_test(NumericalDiff)
 ei_add_test(autodiff_scalar)
 ei_add_test(autodiff)
 
-if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
-endif()
 
 ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
@@ -61,13 +55,11 @@ ei_add_test(FFT)
 
 ei_add_test(EulerAngles)
 
-find_package(MPFR 2.3.0)
-find_package(GMP)
-if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
-  include_directories(${MPFR_INCLUDES} ./mpreal)
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
-  set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
- ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+  include_directories(${MPREAL_INCLUDES})
+  ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -87,8 +79,8 @@ else()
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
   find_package(GLUT)
   find_package(GLEW)
@@ -108,89 +100,192 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
+ei_add_test(idrs)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
 ei_add_test(special_functions)
-
-# TODO: The following test names are prefixed with the cxx11 string, since historically
-# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
-# when using visual studio. We should make the check more strict to enable the tests for
-# newer versions of MSVC.
-if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-ei_add_test(cxx11_tensor_dimension)
-ei_add_test(cxx11_tensor_map)
-ei_add_test(cxx11_tensor_assign)
-ei_add_test(cxx11_tensor_comparisons)
-ei_add_test(cxx11_tensor_forced_eval)
-ei_add_test(cxx11_tensor_math)
-ei_add_test(cxx11_tensor_const)
-ei_add_test(cxx11_tensor_intdiv)
-ei_add_test(cxx11_tensor_casts)
-ei_add_test(cxx11_tensor_empty)
-ei_add_test(cxx11_tensor_sugar)
-ei_add_test(cxx11_tensor_roundings)
-ei_add_test(cxx11_tensor_layout_swap)
-ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-  # This test requires __uint128_t which is only available on 64bit systems
-  ei_add_test(cxx11_tensor_uint128)
-endif()
-endif()
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
 
 if(EIGEN_TEST_CXX11)
   if(EIGEN_TEST_SYCL)
-    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
-  endif(EIGEN_TEST_SYCL)
-  # It should be safe to always run these tests as there is some fallback code for
-  # older compiler that don't support cxx11.
-  set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_SYCL ON)
+    # Forward CMake options as preprocessor definitions
+    if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+      add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+    endif()
+    if(EIGEN_SYCL_NO_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+      add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+    endif()
+    if(EIGEN_SYCL_REG_M)
+      add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+    endif()
+    if(EIGEN_SYCL_REG_N)
+      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+    endif()
+    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+    endif()
+    if(EIGEN_SYCL_ASYNC_EXECUTION)
+      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SKINNY)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+    endif()
+    if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+    add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+  endif()
+    if(EIGEN_SYCL_DISABLE_RANK1)
+      add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SCALAR)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+    endif()
+    if(EIGEN_SYCL_DISABLE_GEMV)
+      add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+    endif()
+    if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+      add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+    endif()
+
+    if(EIGEN_SYCL_TRISYCL)
+      # triSYCL now requires c++17.
+      set(CMAKE_CXX_STANDARD 17)
+    else()
+      if(MSVC)
+        # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+        # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+        set(CMAKE_CXX_STANDARD 14)
+        list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+      else()
+        set(CMAKE_CXX_STANDARD 11)
+        list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+      endif()
+      # The following flags are not supported by Clang and can cause warnings
+      # if used with -Werror so they are removed here.
+      if(COMPUTECPP_USE_COMPILER_DRIVER)
+        set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+        string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      endif()
+      list(APPEND COMPUTECPP_USER_FLAGS
+          -DEIGEN_NO_ASSERTION_CHECKING=1
+          -no-serial-memop
+          -Xclang
+          -cl-mad-enable)
+    endif()
+
+    ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+    set(EIGEN_SYCL OFF)
+  endif()
 
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 
   ei_add_test(cxx11_meta)
-  ei_add_test(cxx11_tensor_simple)
-#  ei_add_test(cxx11_tensor_symmetry)
-  ei_add_test(cxx11_tensor_index_list)
-  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_maxsizevector)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_assign)
+  ei_add_test(cxx11_tensor_block_access)
+  ei_add_test(cxx11_tensor_block_eval)
+  ei_add_test(cxx11_tensor_block_io)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_casts)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_comparisons)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_const)
   ei_add_test(cxx11_tensor_contraction)
   ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_dimension)
+  ei_add_test(cxx11_tensor_empty)
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fft)
   ei_add_test(cxx11_tensor_fixed_size)
-  ei_add_test(cxx11_tensor_of_const_values)
-  ei_add_test(cxx11_tensor_of_complex)
-  ei_add_test(cxx11_tensor_of_strings)
-  ei_add_test(cxx11_tensor_lvalue)
-  ei_add_test(cxx11_tensor_broadcasting)
-  ei_add_test(cxx11_tensor_chipping)
-  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_forced_eval)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_index_list)
   ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_intdiv)
+  ei_add_test(cxx11_tensor_io)
+  ei_add_test(cxx11_tensor_layout_swap)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_map)
+  ei_add_test(cxx11_tensor_math)
+  ei_add_test(cxx11_tensor_mixed_indices)
   ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_move)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_strings)
   ei_add_test(cxx11_tensor_padding)
   ei_add_test(cxx11_tensor_patch)
-  ei_add_test(cxx11_tensor_image_patch)
-  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_reduction)
-  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_roundings)
+  ei_add_test(cxx11_tensor_scan)
   ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_simple)
   ei_add_test(cxx11_tensor_striding)
-  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_sugar)
+  ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
-  ei_add_test(cxx11_tensor_ref)
-  ei_add_test(cxx11_tensor_random)
-  ei_add_test(cxx11_tensor_generator)
-  ei_add_test(cxx11_tensor_custom_op)
-  ei_add_test(cxx11_tensor_custom_index)
-  ei_add_test(cxx11_tensor_fft)
-  ei_add_test(cxx11_tensor_ifft)
-  ei_add_test(cxx11_tensor_scan)
+  ei_add_test(cxx11_tensor_trace)
+  ei_add_test(cxx11_tensor_volume_patch)
+#  ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
 
 endif()
 
@@ -213,7 +308,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
-   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+        string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+    endforeach()
   endif()
 
   set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
@@ -221,37 +320,98 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
-  else()
-    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
-    set(EIGEN_CUDA_CXX11_FLAG "")
-  endif()
-
-  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  set(NVCC_ARCH_FLAGS)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endforeach()
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_complex_cuda)
-  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
-  ei_add_test(cxx11_tensor_reduction_cuda)
-  ei_add_test(cxx11_tensor_argmax_cuda)
-  ei_add_test(cxx11_tensor_cast_float16_cuda)
-  ei_add_test(cxx11_tensor_scan_cuda)
+  ei_add_test(cxx11_tensor_complex_gpu)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+  ei_add_test(cxx11_tensor_reduction_gpu)
+  ei_add_test(cxx11_tensor_argmax_gpu)
+  ei_add_test(cxx11_tensor_cast_float16_gpu)
+  ei_add_test(cxx11_tensor_scan_gpu)
+
+  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+    endif()
+  endforeach()
 
   # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
     ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_cuda)
-    ei_add_test(cxx11_tensor_contract_cuda)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
+    ei_add_test(cxx11_tensor_gpu)
+    ei_add_test(cxx11_tensor_contract_gpu)
+    ei_add_test(cxx11_tensor_of_float16_gpu)
   endif()
 
   # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_cuda)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_gpu)
   endif()
 
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+	include_directories(${CMAKE_CURRENT_BINARY_DIR})
+	include_directories(${HIP_PATH}/include)
+
+	set(EIGEN_ADD_TEST_FILENAME_EXTENSION  "cu")
+	#
+	# complex datatype is not yet supported by HIP
+	# so leaving out those tests for now
+	#
+	# ei_add_test(cxx11_tensor_complex_gpu)
+	# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+	#
+	ei_add_test(cxx11_tensor_reduction_gpu)
+	ei_add_test(cxx11_tensor_argmax_gpu)
+	ei_add_test(cxx11_tensor_cast_float16_gpu)
+	ei_add_test(cxx11_tensor_scan_gpu)
+	ei_add_test(cxx11_tensor_device)
+
+	ei_add_test(cxx11_tensor_gpu)
+	ei_add_test(cxx11_tensor_contract_gpu)
+	ei_add_test(cxx11_tensor_of_float16_gpu)
+	ei_add_test(cxx11_tensor_random_gpu)
+
+	unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+	message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+	message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif()
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif()
+
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index a8cb52864..0955795b6 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -13,146 +13,220 @@
 
 using namespace Eigen;
 
-template<typename EulerSystem, typename Scalar>
-void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
-  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+  return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approxed range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+  do { \
+  VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+  VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+  } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
 {
   typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
   typedef Quaternion<Scalar> QuaternionType;
   typedef AngleAxis<Scalar> AngleAxisType;
-  using std::abs;
-  
-  Scalar alphaRangeStart, alphaRangeEnd;
-  Scalar betaRangeStart, betaRangeEnd;
-  Scalar gammaRangeStart, gammaRangeEnd;
   
-  if (positiveRangeAlpha)
-  {
-    alphaRangeStart = Scalar(0);
-    alphaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    alphaRangeStart = -Scalar(EIGEN_PI);
-    alphaRangeEnd = Scalar(EIGEN_PI);
-  }
+  const Scalar ONE = Scalar(1);
+  const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+  const Scalar PI = Scalar(EIGEN_PI);
   
-  if (positiveRangeBeta)
-  {
-    betaRangeStart = Scalar(0);
-    betaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    betaRangeStart = -Scalar(EIGEN_PI);
-    betaRangeEnd = Scalar(EIGEN_PI);
-  }
+  // It's very important calc the acceptable precision depending on the distance from the pole.
+  const Scalar longitudeRadius = std::abs(
+    EulerSystem::IsTaitBryan ?
+    std::cos(e.beta()) :
+    std::sin(e.beta())
+    );
+  Scalar precision = test_precision<Scalar>() / longitudeRadius;
   
-  if (positiveRangeGamma)
+  Scalar betaRangeStart, betaRangeEnd;
+  if (EulerSystem::IsTaitBryan)
   {
-    gammaRangeStart = Scalar(0);
-    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+    betaRangeStart = -HALF_PI;
+    betaRangeEnd = HALF_PI;
   }
   else
   {
-    gammaRangeStart = -Scalar(EIGEN_PI);
-    gammaRangeEnd = Scalar(EIGEN_PI);
+    if (!EulerSystem::IsBetaOpposite)
+    {
+      betaRangeStart = 0;
+      betaRangeEnd = PI;
+    }
+    else
+    {
+      betaRangeStart = -PI;
+      betaRangeEnd = 0;
+    }
   }
   
-  const int i = EulerSystem::AlphaAxisAbs - 1;
-  const int j = EulerSystem::BetaAxisAbs - 1;
-  const int k = EulerSystem::GammaAxisAbs - 1;
+  const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+  const Vector3 K_ = EulerAnglesType::GammaAxisVector();
   
-  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
-  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
-  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
-  
-  const Vector3 I = EulerAnglesType::AlphaAxisVector();
-  const Vector3 J = EulerAnglesType::BetaAxisVector();
-  const Vector3 K = EulerAnglesType::GammaAxisVector();
-  
-  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  // Is approx checks
+  VERIFY(e.isApprox(e));
+  VERIFY_IS_APPROX(e, e);
+  VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+  const Matrix3 m(e);
+  VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+  EulerAnglesType ebis(m);
   
-  Matrix3 m(e);
-  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  // When no roll(acting like polar representation), we have the best precision.
+  // One of those cases is when the Euler angles are on the pole, and because it's singular case,
+  //  the computation returns no roll.
+  if (ebis.beta() == 0)
+    precision = test_precision<Scalar>();
   
   // Check that eabis in range
-  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
-  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
-  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+  VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+  const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+  VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+  VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+  /*std::cout << "===================\n" <<
+    "e: " << e << std::endl <<
+    "eabis: " << eabis.transpose() << std::endl <<
+    "m: " << m << std::endl <<
+    "mbis: " << mbis << std::endl <<
+    "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+    "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+  VERIFY(m.isApprox(mbis, precision));
+
+  // Test if ea and eabis are the same
+  // Need to check both singular and non-singular cases
+  // There are two singular cases.
+  // 1. When I==K and sin(ea(1)) == 0
+  // 2. When I!=K and cos(ea(1)) == 0
+
+  // TODO: Make this test work well, and use range saturation function.
+  /*// If I==K, and ea[1]==0, then there no unique solution.
+  // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY_IS_APPROX(ea, eabis);*/
   
-  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  // Quaternions
+  const QuaternionType q(e);
+  ebis = q;
+  const QuaternionType qbis(ebis);
+  VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+  //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
   
-  // Invert the relevant axes
-  eabis2[0] *= iFactor;
-  eabis2[1] *= jFactor;
-  eabis2[2] *= kFactor;
+  // A suggestion for simple product test when will be supported.
+  /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+  Matrix3 m2(e2);
+  VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_vec<+A,+B,+C>(ea);
+  verify_euler_vec<+A,+B,-C>(ea);
+  verify_euler_vec<+A,-B,+C>(ea);
+  verify_euler_vec<+A,-B,-C>(ea);
   
-  // Saturate the angles to the correct range
-  if (positiveRangeAlpha && (eabis2[0] < 0))
-    eabis2[0] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeBeta && (eabis2[1] < 0))
-    eabis2[1] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeGamma && (eabis2[2] < 0))
-    eabis2[2] += Scalar(2 * EIGEN_PI);
+  verify_euler_vec<-A,+B,+C>(ea);
+  verify_euler_vec<-A,+B,-C>(ea);
+  verify_euler_vec<-A,-B,+C>(ea);
+  verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_all_neg<X,Y,Z>(ea);
+  verify_euler_all_neg<X,Y,X>(ea);
+  verify_euler_all_neg<X,Z,Y>(ea);
+  verify_euler_all_neg<X,Z,X>(ea);
   
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  verify_euler_all_neg<Y,Z,X>(ea);
+  verify_euler_all_neg<Y,Z,Y>(ea);
+  verify_euler_all_neg<Y,X,Z>(ea);
+  verify_euler_all_neg<Y,X,Y>(ea);
   
-  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
-  VERIFY_IS_APPROX(m,  mbis);
+  verify_euler_all_neg<Z,X,Y>(ea);
+  verify_euler_all_neg<Z,X,Z>(ea);
+  verify_euler_all_neg<Z,Y,X>(ea);
+  verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  const Scalar PI = Scalar(EIGEN_PI);
   
-  // Tests that are only relevant for no possitive range
-  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
   {
-    /* If I==K, and ea[1]==0, then there no unique solution. */ 
-    /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ 
-    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
-      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
-    
-    // approx_or_less_than does not work for 0
-    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+    check_all_var(Vector3(PI/4, singularBeta, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+    check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+    check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+    check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
   }
   
-  // Quaternions
-  QuaternionType q(e);
-  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
-}
-
-template<typename EulerSystem, typename Scalar>
-void verify_euler(const Matrix<Scalar,3,1>& ea)
-{
-  verify_euler_ranged<EulerSystem>(ea, false, false, false);
-  verify_euler_ranged<EulerSystem>(ea, false, false, true);
-  verify_euler_ranged<EulerSystem>(ea, false, true, false);
-  verify_euler_ranged<EulerSystem>(ea, false, true, true);
-  verify_euler_ranged<EulerSystem>(ea, true, false, false);
-  verify_euler_ranged<EulerSystem>(ea, true, false, true);
-  verify_euler_ranged<EulerSystem>(ea, true, true, false);
-  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+  // This one for sanity, it had a problem with near pole cases in float scalar.
+  check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
 }
 
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+template<typename Scalar> void eulerangles_manual()
 {
-  verify_euler<EulerSystemXYZ>(ea);
-  verify_euler<EulerSystemXYX>(ea);
-  verify_euler<EulerSystemXZY>(ea);
-  verify_euler<EulerSystemXZX>(ea);
-  
-  verify_euler<EulerSystemYZX>(ea);
-  verify_euler<EulerSystemYZY>(ea);
-  verify_euler<EulerSystemYXZ>(ea);
-  verify_euler<EulerSystemYXY>(ea);
-  
-  verify_euler<EulerSystemZXY>(ea);
-  verify_euler<EulerSystemZXZ>(ea);
-  verify_euler<EulerSystemZYX>(ea);
-  verify_euler<EulerSystemZYZ>(ea);
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,Dynamic,1> VectorX;
+  const Vector3 Zero = Vector3::Zero();
+  const Scalar PI = Scalar(EIGEN_PI);
+  
+  check_all_var(Zero);
+  
+  // singular cases
+  check_singular_cases(PI/2);
+  check_singular_cases(-PI/2);
+  
+  check_singular_cases(Scalar(0));
+  check_singular_cases(Scalar(-0));
+  
+  check_singular_cases(PI);
+  check_singular_cases(-PI);
+  
+  // non-singular cases
+  VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  VectorX beta =  VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+  VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  for (int i = 0; i < alpha.size(); ++i) {
+    for (int j = 0; j < beta.size(); ++j) {
+      for (int k = 0; k < gamma.size(); ++k) {
+        check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+      }
+    }
+  }
 }
 
-template<typename Scalar> void eulerangles()
+template<typename Scalar> void eulerangles_rand()
 {
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
@@ -199,10 +273,24 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
 }
 
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
 {
+  // Simple cast test
+  EulerAnglesXYZd onesEd(1, 1, 1);
+  EulerAnglesXYZf onesEf = onesEd.cast<float>();
+  VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+  
+  CALL_SUBTEST_1( eulerangles_manual<float>() );
+  CALL_SUBTEST_2( eulerangles_manual<double>() );
+  
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( eulerangles<float>() );
-    CALL_SUBTEST_2( eulerangles<double>() );
+    CALL_SUBTEST_3( eulerangles_rand<float>() );
+    CALL_SUBTEST_4( eulerangles_rand<double>() );
   }
+  
+  // TODO: Add tests for auto diff
+  // TODO: Add tests for complex numbers
 }
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
     VERIFY( (in1-in).norm() < test_precision<float>() );
 }
 
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
 {
   CALL_SUBTEST( test_return_by_value(32) );
   //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..c667b7247 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -15,6 +15,15 @@
 // tolerance for chekcing number of iterations
 #define LM_EVAL_COUNT_TOL 4/3
 
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+            ++g_test_level; \
+            VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+            VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+            --g_test_level; \
+            VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+            VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+        }
+
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
     /*      subroutine fcn for chkder example. */
@@ -180,8 +189,7 @@ void testLmder1()
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -209,8 +217,7 @@ void testLmder()
 
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -294,8 +301,7 @@ void testHybrj1()
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -329,8 +335,7 @@ void testHybrj()
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -485,8 +490,7 @@ void testLmstr1()
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -514,8 +518,7 @@ void testLmstr()
 
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -565,7 +568,7 @@ void testLmdif1()
 
   // do the computation
   lmdif_functor functor;
-  DenseIndex nfev;
+  DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
   info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
 
   // check return value
@@ -686,8 +689,7 @@ void testNistChwirut2(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -707,8 +709,7 @@ void testNistChwirut2(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -766,8 +767,7 @@ void testNistMisra1a(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 19);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 19, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -783,8 +783,7 @@ void testNistMisra1a(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 5);
-  VERIFY_IS_EQUAL(lm.njev, 4);
+  LM_CHECK_N_ITERS(lm, 5, 4);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -856,8 +855,7 @@ void testNistHahn1(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -878,8 +876,7 @@ void testNistHahn1(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -942,8 +939,7 @@ void testNistMisra1d(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 7);
+  LM_CHECK_N_ITERS(lm, 9, 7);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -959,8 +955,7 @@ void testNistMisra1d(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 4);
-  VERIFY_IS_EQUAL(lm.njev, 3);
+  LM_CHECK_N_ITERS(lm, 4, 3);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -1020,8 +1015,7 @@ void testNistLanczos1(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 79);
-  VERIFY_IS_EQUAL(lm.njev, 72);
+  LM_CHECK_N_ITERS(lm, 79, 72);
   // check norm^2
   std::cout.precision(30);
   std::cout << lm.fvec.squaredNorm() << "\n";
@@ -1043,8 +1037,7 @@ void testNistLanczos1(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
@@ -1108,8 +1101,7 @@ void testNistRat42(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1126,8 +1118,7 @@ void testNistRat42(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1186,8 +1177,7 @@ void testNistMGH10(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 2); 
-  VERIFY_IS_EQUAL(lm.nfev, 284 ); 
-  VERIFY_IS_EQUAL(lm.njev, 249 ); 
+  LM_CHECK_N_ITERS(lm, 284, 249); 
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1204,8 +1194,7 @@ void testNistMGH10(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 126);
-  VERIFY_IS_EQUAL(lm.njev, 116);
+  LM_CHECK_N_ITERS(lm, 126, 116);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1265,8 +1254,7 @@ void testNistBoxBOD(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY(lm.nfev < 31); // 31
-  VERIFY(lm.njev < 25); // 25
+  LM_CHECK_N_ITERS(lm, 31, 25);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1284,9 +1272,8 @@ void testNistBoxBOD(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1); 
-  VERIFY_IS_EQUAL(lm.nfev, 15 ); 
-  VERIFY_IS_EQUAL(lm.njev, 14 ); 
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 15, 14);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1356,12 +1343,7 @@ void testNistMGH17(void)
   
   // check return value
   VERIFY_IS_EQUAL(info, 2); 
-  ++g_test_level;
-  VERIFY_IS_EQUAL(lm.nfev, 602);  // 602
-  VERIFY_IS_EQUAL(lm.njev, 545);  // 545
-  --g_test_level;
-  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
-  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
+  LM_CHECK_N_ITERS(lm, 602, 545);
 
   /*
    * Second try
@@ -1373,8 +1355,7 @@ void testNistMGH17(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
   // check x
@@ -1438,9 +1419,8 @@ void testNistMGH09(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1); 
-  VERIFY_IS_EQUAL(lm.nfev, 490 ); 
-  VERIFY_IS_EQUAL(lm.njev, 376 ); 
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 490, 376);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1459,8 +1439,7 @@ void testNistMGH09(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 16);
+  LM_CHECK_N_ITERS(lm, 18, 16);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1525,8 +1504,7 @@ void testNistBennett5(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 758);
-  VERIFY_IS_EQUAL(lm.njev, 744);
+  LM_CHECK_N_ITERS(lm, 758, 744);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1543,8 +1521,7 @@ void testNistBennett5(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 203);
-  VERIFY_IS_EQUAL(lm.njev, 192);
+  LM_CHECK_N_ITERS(lm, 203, 192);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1613,8 +1590,7 @@ void testNistThurber(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 39);
-  VERIFY_IS_EQUAL(lm.njev, 36);
+  LM_CHECK_N_ITERS(lm, 39,36);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1638,8 +1614,7 @@ void testNistThurber(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 29);
-  VERIFY_IS_EQUAL(lm.njev, 28);
+  LM_CHECK_N_ITERS(lm, 29, 28);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1705,8 +1680,7 @@ void testNistRat43(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 27);
-  VERIFY_IS_EQUAL(lm.njev, 20);
+  LM_CHECK_N_ITERS(lm, 27, 20);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1727,8 +1701,7 @@ void testNistRat43(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1790,8 +1763,7 @@ void testNistEckerle4(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1808,8 +1780,7 @@ void testNistEckerle4(void)
 
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1818,7 +1789,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
 {
     // Tests using the examples provided by (c)minpack
     CALL_SUBTEST/*_1*/(testChkder());
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..6d836413b 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -24,7 +24,7 @@ struct Functor
   int m_inputs, m_values;
   
   Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
   
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -107,7 +107,7 @@ void test_central()
     VERIFY_IS_APPROX(jac, actual_jac);
 }
 
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
 {
     CALL_SUBTEST(test_forward());
     CALL_SUBTEST(test_central());
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..f442e416a 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -70,13 +70,16 @@ void alignedvector3()
     VERIFY_IS_APPROX(f6,r1-r4);
   }
   
+  FastType f8, f9(0,0,0);
+  VERIFY_IS_APPROX(f9-f1,-f1);
+
   std::stringstream ss1, ss2;
   ss1 << f1;
   ss2 << r1;
   VERIFY(ss1.str()==ss2.str());
 }
 
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( alignedvector3<float>() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 85743137e..2cea56ba5 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -44,7 +44,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -306,6 +306,8 @@ double bug_1222() {
   return denom.value();
 }
 
+#ifdef EIGEN_TEST_PART_5
+
 double bug_1223() {
   using std::min;
   typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
@@ -326,8 +328,8 @@ double bug_1223() {
 
 // regression test for some compilation issues with specializations of ScalarBinaryOpTraits
 void bug_1260() {
-  Matrix4d A;
-  Vector4d v;
+  Matrix4d A = Matrix4d::Ones();
+  Vector4d v = Vector4d::Ones();
   A*v;
 }
 
@@ -336,7 +338,7 @@ double bug_1261() {
   typedef AutoDiffScalar<Matrix2d> AD;
   typedef Matrix<AD,2,1> VectorAD;
 
-  VectorAD v;
+  VectorAD v(0.,0.);
   const AD maxVal = v.maxCoeff();
   const AD minVal = v.minCoeff();
   return maxVal.value() + minVal.value();
@@ -344,13 +346,30 @@ double bug_1261() {
 
 double bug_1264() {
   typedef AutoDiffScalar<Vector2d> AD;
-  const AD s;
-  const Matrix<AD, 3, 1> v1;
+  const AD s = 0.;
+  const Matrix<AD, 3, 1> v1(0.,0.,0.);
   const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
   return v2(0).value();
 }
 
-void test_autodiff()
+// check with expressions on constants
+double bug_1281() {
+  int n = 2;
+  typedef AutoDiffScalar<VectorXd> AD;
+  const AD c = 1.;
+  AD x0(2,n,0);
+  AD y1 = (AD(c)+AD(c))*x0;
+  y1 = x0 * (AD(c)+AD(c));
+  AD y2 = (-AD(c))+x0;
+  y2 = x0+(-AD(c));
+  AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+  y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+  return (y1+y2+y3).value();
+}
+
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( test_autodiff_scalar<1>() );
@@ -359,9 +378,10 @@ void test_autodiff()
     CALL_SUBTEST_4( test_autodiff_hessian<1>() );
   }
 
-  bug_1222();
-  bug_1223();
-  bug_1260();
-  bug_1261();
+  CALL_SUBTEST_5( bug_1222() );
+  CALL_SUBTEST_5( bug_1223() );
+  CALL_SUBTEST_5( bug_1260() );
+  CALL_SUBTEST_5( bug_1261() );
+  CALL_SUBTEST_5( bug_1281() );
 }
 
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 9cf11280c..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -81,12 +81,15 @@ void check_limits_specialization()
   typedef std::numeric_limits<AD> A;
   typedef std::numeric_limits<Scalar> B;
 
+  // workaround "unused typedef" warning:
+  VERIFY(!bool(internal::is_same<B, A>::value));
+
 #if EIGEN_HAS_CXX11
   VERIFY(bool(std::is_base_of<B, A>::value));
 #endif
 }
 
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2<float>() );
diff --git a/unsupported/test/bessel_functions.cpp b/unsupported/test/bessel_functions.cpp
new file mode 100644
index 000000000..06765bfab
--- /dev/null
+++ b/unsupported/test/bessel_functions.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i))) {
+      VERIFY_IS_APPROX( x(i), y(i) );
+    }
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );
+  }
+}
+
+template<typename ArrayType> void array_bessel_functions() 
+{
+  // Test Bessel function i0. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 4.35582826e+07, 6.21841242e+06, 8.93446228e+05, 1.29418563e+05,
+       1.89489253e+04, 2.81571663e+03, 4.27564116e+02, 6.72344070e+01,
+       1.13019220e+01, 2.27958530e+00, 1.00000000e+00, 2.27958530e+00,
+       1.13019220e+01, 6.72344070e+01, 4.27564116e+02, 2.81571663e+03,
+       1.89489253e+04, 1.29418563e+05, 8.93446228e+05, 6.21841242e+06,
+       4.35582826e+07;
+
+    CALL_SUBTEST(res = bessel_i0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+        0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+        0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+        0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+        0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+        0.0897803118848;
+
+    CALL_SUBTEST(res = bessel_i0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -4.24549734e+07, -6.04313324e+06, -8.65059436e+05, -1.24707259e+05,
+       -1.81413488e+04, -2.67098830e+03, -3.99873137e+02, -6.13419368e+01,
+       -9.75946515e+00, -1.59063685e+00,  0.00000000e+00,  1.59063685e+00,
+        9.75946515e+00,  6.13419368e+01,  3.99873137e+02,  2.67098830e+03,
+        1.81413488e+04,  1.24707259e+05,  8.65059436e+05,  6.04313324e+06,
+        4.24549734e+07;
+
+    CALL_SUBTEST(res = bessel_i1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function i1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(21);
+    ArrayType expected(21);
+    ArrayType res(21);
+
+    x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+        2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+    expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+        -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+        -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+        0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+        0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+        0.0875062221833;
+
+    CALL_SUBTEST(res = bessel_i1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j0. Reference results obtained with SciPy.
+  {
+    ArrayType x(77);
+    ArrayType expected(77);
+    ArrayType res(77);
+
+    x << -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.;
+
+    expected << 0.11433274,  0.01086237, -0.10556738,
+             -0.12684568, -0.03042119,  0.09727067,  0.13807901,  0.05120815,
+             -0.08636798, -0.14784876, -0.07315701,  0.07274192,  0.15599932,
+              0.09626678, -0.05623027, -0.16241278, -0.12065148,  0.03657907,
+              0.16702466,  0.14662944, -0.01335581, -0.16985425, -0.17489907,
+             -0.01422447,  0.17107348,  0.2069261 ,  0.04768931, -0.1711903 ,
+             -0.24593576, -0.09033361,  0.17165081,  0.30007927,  0.15064526,
+             -0.17759677, -0.39714981, -0.26005195,  0.22389078,  0.76519769,
+              1.        ,  0.76519769,  0.22389078, -0.26005195, -0.39714981,
+             -0.17759677,  0.15064526,  0.30007927,  0.17165081, -0.09033361,
+             -0.24593576, -0.1711903 ,  0.04768931,  0.2069261 ,  0.17107348,
+             -0.01422447, -0.17489907, -0.16985425, -0.01335581,  0.14662944,
+              0.16702466,  0.03657907, -0.12065148, -0.16241278, -0.05623027,
+              0.09626678,  0.15599932,  0.07274192, -0.07315701, -0.14784876,
+             -0.08636798,  0.05120815,  0.13807901,  0.09727067, -0.03042119,
+             -0.12684568, -0.10556738,  0.01086237,  0.11433274;
+
+    CALL_SUBTEST(res = bessel_j0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function j1. Reference results obtained with SciPy.
+  {
+    ArrayType x(81);
+    ArrayType expected(81);
+    ArrayType res(81);
+
+    x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+      -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+      -18., -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,
+       -7.,  -6.,  -5.,  -4.,  -3.,  -2.,  -1.,   0.,   1.,   2.,   3.,
+        4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,  12.,  13.,  14.,
+       15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,  24.,  25.,
+       26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,  35.,  36.,
+       37.,  38.,  39.,  40.;
+
+    expected << -0.12603832, -0.0640561 ,  0.05916189,  0.13058004,  0.08232981,
+             -0.04399094, -0.13297118, -0.10061965,  0.02658903,  0.13302432,
+              0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573,
+              0.12535025,  0.15403807,  0.03951932, -0.11717779, -0.17112027,
+             -0.06683312,  0.10570143,  0.18799489,  0.09766849, -0.09039718,
+             -0.20510404, -0.13337515,  0.07031805,  0.2234471 ,  0.1767853 ,
+             -0.04347275, -0.24531179, -0.23463635,  0.00468282,  0.27668386,
+              0.32757914,  0.06604333, -0.33905896, -0.57672481, -0.44005059,
+              0.        ,  0.44005059,  0.57672481,  0.33905896, -0.06604333,
+             -0.32757914, -0.27668386, -0.00468282,  0.23463635,  0.24531179,
+              0.04347275, -0.1767853 , -0.2234471 , -0.07031805,  0.13337515,
+              0.20510404,  0.09039718, -0.09766849, -0.18799489, -0.10570143,
+              0.06683312,  0.17112027,  0.11717779, -0.03951932, -0.15403807,
+             -0.12535025,  0.01504573,  0.13658472,  0.13055149,  0.0069342 ,
+             -0.11875106, -0.13302432, -0.02658903,  0.10061965,  0.13297118,
+              0.04399094, -0.08232981, -0.13058004, -0.05916189,  0.0640561 ,
+              0.12603832;
+
+    CALL_SUBTEST(res = bessel_j1(x);
+                 verify_component_wise(res, expected););
+  }
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 ,
+             0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488,
+             0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802,
+             0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616,
+             0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602,
+             0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778,
+             0.20265524, 0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01,
+             3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03,
+             4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05,
+             6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07,
+             9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09,
+             1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11,
+             2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12,
+             4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14,
+             7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16,
+             1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18,
+             2.31029936e-18, 8.39286110e-19;
+
+    CALL_SUBTEST(res = bessel_k0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k0e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+             0.6977616 , 0.60929767, 0.54780756, 0.50186313,
+             0.4658451 , 0.43662302, 0.41229555, 0.39163193,
+             0.3737955 , 0.35819488, 0.34439865, 0.33208364,
+             0.32100235, 0.31096159, 0.30180802, 0.29341821,
+             0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+             0.25994703, 0.25452917, 0.2494366 , 0.24463801,
+             0.24010616, 0.23581722, 0.23175022, 0.22788667,
+             0.22421014, 0.22070602, 0.21736123, 0.21416406,
+             0.21110397, 0.20817141, 0.20535778, 0.20265524,
+             0.20005668, 0.19755558;
+
+    CALL_SUBTEST(res = bessel_k0e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01,
+             4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03,
+             4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05,
+             6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07,
+             1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09,
+             1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11,
+             2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12,
+             4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14,
+             7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16,
+             1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18,
+             2.33973340e-18, 8.49713195e-19;
+
+    CALL_SUBTEST(res = bessel_k1(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function k1e. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685,
+             0.80656348, 0.68157595, 0.60027386, 0.54217591,
+             0.49807158, 0.46314909, 0.43462525, 0.41076657,
+             0.39043094, 0.37283175, 0.35740757, 0.34374563,
+             0.33153489, 0.32053597, 0.31056123, 0.30146131,
+             0.29311559, 0.2854255 , 0.27830958, 0.27169987,
+             0.26553913, 0.25977879, 0.25437733, 0.249299  ,
+             0.24451285, 0.23999191, 0.2357126 , 0.23165413,
+             0.22779816, 0.22412841, 0.22063036, 0.21729103,
+             0.21409878, 0.21104314, 0.20811462, 0.20530466,
+             0.20260547, 0.20000997;
+
+    CALL_SUBTEST(res = bessel_k1e(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y0. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -0.93157302, -0.44451873, 0.08825696,  0.51037567,  0.37685001,
+             -0.01694074, -0.30851763, -0.28819468, -0.02594974,  0.22352149,
+             0.2499367 ,  0.05567117, -0.16884732, -0.22523731, -0.07820786,
+             0.12719257,  0.2054643 , 0.095811  , -0.0926372 , -0.18755216,
+             -0.10951969,  0.0626406 , 0.17020176,  0.1198876 , -0.03598179,
+             -0.15283403, -0.12724943, 0.01204463,  0.13521498,  0.13183647,
+             0.00948116, -0.11729573, -0.13383266, -0.02874248,  0.09913483,
+             0.13340405,  0.04579799, -0.08085609, -0.13071488, -0.06066076,
+             0.06262353,  0.12593642;
+
+    CALL_SUBTEST(res = bessel_y0(x);
+                 verify_component_wise(res, expected););
+  }
+
+  // Test Bessel function y1. Reference results obtained with SciPy.
+  {
+    ArrayType x(42);
+    ArrayType expected(42);
+    ArrayType res(42);
+
+    x << 0.25, 0.5,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
+       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+       39., 40.;
+
+    expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243,
+             0.32467442,  0.39792571,  0.14786314, -0.17501034, -0.30266724,
+             -0.15806046,  0.10431458,  0.24901542, 0.16370554, -0.05709922,
+             -0.21008141, -0.16664484,  0.02107363, 0.17797517,  0.16720504,
+             0.00815513, -0.14956011, -0.16551161, -0.03253926,  0.12340586,
+             0.1616692 ,  0.05305978, -0.09882996, -0.15579655, -0.07025124,
+             0.07552213,  0.14803412,  0.08442557, -0.05337283, -0.13854483,
+             -0.09578012,  0.03238588,  0.12751273, 0.10445477, -0.01262946,
+             -0.11514066, -0.11056411, -0.00579351;
+
+    CALL_SUBTEST(res = bessel_y1(x);
+                 verify_component_wise(res, expected););
+  }
+}
+
+EIGEN_DECLARE_TEST(bessel_functions)
+{
+  CALL_SUBTEST_1(array_bessel_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_bessel_functions<ArrayXd>());
+}
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 3b598bf42..7bf4e965f 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -30,11 +30,11 @@ static void test_basic_eventcount()
   EventCount ec(waiters);
   EventCount::Waiter& w = waiters[0];
   ec.Notify(false);
-  ec.Prewait(&w);
+  ec.Prewait();
   ec.Notify(true);
   ec.CommitWait(&w);
-  ec.Prewait(&w);
-  ec.CancelWait(&w);
+  ec.Prewait();
+  ec.CancelWait();
 }
 
 // Fake bounded counter-based queue.
@@ -112,7 +112,7 @@ static void test_stress_eventcount()
         unsigned idx = rand_reentrant(&rnd) % kQueues;
         if (queues[idx].Pop()) continue;
         j--;
-        ec.Prewait(&w);
+        ec.Prewait();
         bool empty = true;
         for (int q = 0; q < kQueues; q++) {
           if (!queues[q].Empty()) {
@@ -121,7 +121,7 @@ static void test_stress_eventcount()
           }
         }
         if (!empty) {
-          ec.CancelWait(&w);
+          ec.CancelWait();
           continue;
         }
         ec.CommitWait(&w);
@@ -135,7 +135,7 @@ static void test_stress_eventcount()
   }
 }
 
-void test_cxx11_eventcount()
+EIGEN_DECLARE_TEST(cxx11_eventcount)
 {
   CALL_SUBTEST(test_basic_eventcount());
   CALL_SUBTEST(test_stress_eventcount());
diff --git a/unsupported/test/cxx11_maxsizevector.cpp b/unsupported/test/cxx11_maxsizevector.cpp
new file mode 100644
index 000000000..46b689a8e
--- /dev/null
+++ b/unsupported/test/cxx11_maxsizevector.cpp
@@ -0,0 +1,77 @@
+#include "main.h"
+
+#include <exception>  // std::exception
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+struct Foo
+{
+  static Index object_count;
+  static Index object_limit;
+  EIGEN_ALIGN_TO_BOUNDARY(128) int dummy;
+
+  Foo(int x=0) : dummy(x)
+  {
+#ifdef EIGEN_EXCEPTIONS
+    // TODO: Is this the correct way to handle this?
+    if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+    std::cout << '+';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+  Foo(const Foo&)
+  {
+    std::cout << 'c';
+    ++Foo::object_count;
+    eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+  }
+
+  ~Foo()
+  {
+    std::cout << '~';
+    --Foo::object_count;
+  }
+
+  class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
+
+
+
+EIGEN_DECLARE_TEST(cxx11_maxsizevector)
+{
+  typedef MaxSizeVector<Foo> VectorX;
+  Foo::object_count = 0;
+  for(int r = 0; r < g_repeat; r++) {
+    Index rows = internal::random<Index>(3,30);
+    Foo::object_limit = internal::random<Index>(0, rows - 2);
+    std::cout << "object_limit = " << Foo::object_limit << std::endl;
+    bool exception_raised = false;
+#ifdef EIGEN_EXCEPTIONS
+    try
+    {
+#endif
+      std::cout <<       "\nVectorX m(" << rows << ");\n";
+      VectorX vect(rows);
+      for(int i=0; i<rows; ++i)
+          vect.push_back(Foo());
+#ifdef EIGEN_EXCEPTIONS
+      VERIFY(false);  // not reached if exceptions are enabled
+    }
+    catch (const Foo::Fail&) { exception_raised = true; }
+    VERIFY(exception_raised);
+#endif
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+    {
+      Foo::object_limit = rows+1;
+      VectorX vect2(rows, Foo());
+      VERIFY_IS_EQUAL(Foo::object_count, rows);
+    }
+    VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+    std::cout << '\n';
+  }
+}
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index 8911c59d8..510e11032 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -340,7 +340,7 @@ static void test_array_misc()
   VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
 }
 
-void test_cxx11_meta()
+EIGEN_DECLARE_TEST(cxx11_meta)
 {
   CALL_SUBTEST(test_gen_numeric_list());
   CALL_SUBTEST(test_concat());
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 5f9bb938b..993ee1789 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -11,22 +11,23 @@
 #define EIGEN_USE_THREADS
 #include "main.h"
 #include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"
 
 static void test_create_destroy_empty_pool()
 {
   // Just create and destroy the pool. This will wind up and tear down worker
   // threads. Ensure there are no issues in that logic.
   for (int i = 0; i < 16; ++i) {
-    NonBlockingThreadPool tp(i);
+    ThreadPool tp(i);
   }
 }
 
 
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
 {
   // Test we never-ever fail to match available tasks with idle threads.
   const int kThreads = 16;  // code below expects that this is a multiple of 4
-  NonBlockingThreadPool tp(kThreads);
+  ThreadPool tp(kThreads, allow_spinning);
   VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
   VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
   for (int iter = 0; iter < 100; ++iter) {
@@ -100,8 +101,80 @@ static void test_parallelism()
   }
 }
 
-void test_cxx11_non_blocking_thread_pool()
+
+static void test_cancel()
+{
+  ThreadPool tp(2);
+
+  // Schedule a large number of closure that each sleeps for one second. This
+  // will keep the thread pool busy for much longer than the default test timeout.
+  for (int i = 0; i < 1000; ++i) {
+    tp.Schedule([]() {
+      std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+    });
+  }
+
+  // Cancel the processing of all the closures that are still pending.
+  tp.Cancel();
+}
+
+static void test_pool_partitions() {
+  const int kThreads = 2;
+  ThreadPool tp(kThreads);
+
+  // Assign each thread to its own partition, so that stealing other work only
+  // occurs globally when a thread is idle.
+  std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+  for (int i = 0; i < kThreads; ++i) {
+    steal_partitions[i] = std::make_pair(i, i + 1);
+  }
+  tp.SetStealPartitions(steal_partitions);
+
+  std::atomic<int> running(0);
+  std::atomic<int> done(0);
+  std::atomic<int> phase(0);
+
+  // Schedule kThreads tasks and ensure that they all are running.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.Schedule([&]() {
+      const int thread_id = tp.CurrentThreadId();
+      VERIFY_GE(thread_id, 0);
+      VERIFY_LE(thread_id, kThreads - 1);
+      ++running;
+      while (phase < 1) {
+      }
+      ++done;
+    });
+  }
+  while (running != kThreads) {
+  }
+  // Schedule each closure to only run on thread 'i' and verify that it does.
+  for (int i = 0; i < kThreads; ++i) {
+    tp.ScheduleWithHint(
+        [&, i]() {
+          ++running;
+          const int thread_id = tp.CurrentThreadId();
+          VERIFY_IS_EQUAL(thread_id, i);
+          while (phase < 2) {
+          }
+          ++done;
+        },
+        i, i + 1);
+  }
+  running = 0;
+  phase = 1;
+  while (running != kThreads) {
+  }
+  running = 0;
+  phase = 2;
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
 {
   CALL_SUBTEST(test_create_destroy_empty_pool());
-  CALL_SUBTEST(test_parallelism());
+  CALL_SUBTEST(test_parallelism(true));
+  CALL_SUBTEST(test_parallelism(false));
+  CALL_SUBTEST(test_cancel());
+  CALL_SUBTEST(test_pool_partitions());
 }
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 91f690114..8fc5a3074 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -227,7 +227,7 @@ void test_stress_runqueue()
   VERIFY(total.load() == 0);
 }
 
-void test_cxx11_runqueue()
+EIGEN_DECLARE_TEST(cxx11_runqueue)
 {
   CALL_SUBTEST_1(test_basic_runqueue());
   CALL_SUBTEST_2(test_empty_runqueue());
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
index 037767270..4a0c8967b 100644
--- a/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -273,7 +273,7 @@ static void test_argmin_dim()
   }
 }
 
-void test_cxx11_tensor_argmax()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
 {
   CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
   CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 653443dc5..79f4066e9 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,19 +9,18 @@
 
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 
 template <int Layout>
-void test_cuda_simple_argmax()
+void test_gpu_simple_argmax()
 {
   Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
   Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@@ -37,13 +36,13 @@ void test_cuda_simple_argmax()
   double* d_in;
   DenseIndex* d_out_max;
   DenseIndex* d_out_min;
-  cudaMalloc((void**)(&d_in), in_bytes);
-  cudaMalloc((void**)(&d_out_max), out_bytes);
-  cudaMalloc((void**)(&d_out_min), out_bytes);
+  gpuMalloc((void**)(&d_in), in_bytes);
+  gpuMalloc((void**)(&d_out_max), out_bytes);
+  gpuMalloc((void**)(&d_out_min), out_bytes);
 
-  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@@ -53,20 +52,20 @@ void test_cuda_simple_argmax()
   gpu_out_max.device(gpu_device) = gpu_in.argmax();
   gpu_out_min.device(gpu_device) = gpu_in.argmin();
 
-  assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
   VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
 
-  cudaFree(d_in);
-  cudaFree(d_out_max);
-  cudaFree(d_out_min);
+  gpuFree(d_in);
+  gpuFree(d_out_max);
+  gpuFree(d_out_min);
 }
 
 template <int DataLayout>
-void test_cuda_argmax_dim()
+void test_gpu_argmax_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -100,12 +99,12 @@ void test_cuda_argmax_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -113,8 +112,8 @@ void test_cuda_argmax_dim()
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(),
                     size_t(2*3*5*7 / tensor.dimension(dim)));
@@ -137,25 +136,25 @@ void test_cuda_argmax_dim()
       }
     }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
       // Expect max to be in the last index of the reduced dimension
       VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
 template <int DataLayout>
-void test_cuda_argmin_dim()
+void test_gpu_argmin_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -189,12 +188,12 @@ void test_cuda_argmin_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -202,8 +201,8 @@ void test_cuda_argmin_dim()
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(),
                     2*3*5*7 / tensor.dimension(dim));
@@ -226,29 +225,29 @@ void test_cuda_argmin_dim()
       }
     }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
       // Expect max to be in the last index of the reduced dimension
       VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
-  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 000000000..7ac71286e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
+  Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+  Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;
+  in(1, 1, 1) = 1000.0;
+
+  std::size_t in_bytes = in.size() * sizeof(DataType);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+  DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+  DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in,
+                                                                           Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+  sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes);
+
+  gpu_out_max.device(sycl_device) = gpu_in.argmax();
+  gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+  sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+  sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+  VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1);
+  VERIFY_IS_EQUAL(out_min(), 0);
+
+  sycl_device.deallocate(d_in);
+  sycl_device.deallocate(d_out_max);
+  sycl_device.deallocate(d_out_min);
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
+            // = 10.0
+            tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
+  DenseIndex sizeDim0 = 9;
+  DenseIndex sizeDim1 = 3;
+  DenseIndex sizeDim2 = 5;
+  DenseIndex sizeDim3 = 7;
+  Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+  std::vector<DenseIndex> dims;
+  dims.push_back(sizeDim0);
+  dims.push_back(sizeDim1);
+  dims.push_back(sizeDim2);
+  dims.push_back(sizeDim3);
+  for (DenseIndex dim = 0; dim < 4; ++dim) {
+    array<DenseIndex, 3> out_shape;
+    for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+    Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(DataType);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+    DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+    Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+        d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+                    size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    sycl_device.synchronize();
+
+    for (DenseIndex i = 0; i < sizeDim0; ++i) {
+      for (DenseIndex j = 0; j < sizeDim1; ++j) {
+        for (DenseIndex k = 0; k < sizeDim2; ++k) {
+          for (DenseIndex l = 0; l < sizeDim3; ++l) {
+            ix[0] = i;
+            ix[1] = j;
+            ix[2] = k;
+            ix[3] = l;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          }
+        }
+      }
+    }
+
+    sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+    gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+    sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+    sycl_device.deallocate(d_in);
+    sycl_device.deallocate(d_out);
+  }
+}
+
+template <typename DataType, typename Device_Selector>
+void sycl_argmax_test_per_device(const Device_Selector& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index 8fe85d83c..ce9d24369 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -358,7 +358,7 @@ static void test_std_initializers_tensor() {
 #endif  // EIGEN_HAS_VARIADIC_TEMPLATES
 }
 
-void test_cxx11_tensor_assign()
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
new file mode 100644
index 000000000..5fb12e0e0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -0,0 +1,576 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <algorithm>
+#include <set>
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TensorBlockShapeType;
+
+static TensorOpCost zeroCost() { return {0, 0, 0}; }
+
+template<typename T>
+static const T& choose(int layout, const T& col, const T& row) {
+  return layout == ColMajor ? col : row;
+}
+
+static TensorBlockShapeType RandomShape() {
+  return internal::random<bool>()
+         ? TensorBlockShapeType::kUniformAllDims
+         : TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(1, 20);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+  T* data = new T[size];
+  for (int i = 0; i < size; ++i) {
+    data[i] = internal::random<T>();
+  }
+  return data;
+}
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+  for (int i = 0; i < NumDims; ++i) {
+    std::cout << dims[i] << "; ";
+  }
+  std::cout << std::endl;
+}
+
+template <int Layout>
+static void test_block_mapper_sanity()
+{
+  typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+  DSizes<Index, 2> tensor_dims(100, 100);
+
+  // Test uniform blocks.
+  TensorBlockMapper uniform_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100);
+
+  // 10x10 blocks
+  auto uniform_b0 = uniform_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10);
+  VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10);
+
+  // Test skewed to inner dims blocks.
+  TensorBlockMapper skewed_block_mapper(
+      tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()});
+
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100);
+  VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100);
+
+  // 1x100 (100x1) rows/cols depending on a tensor layout.
+  auto skewed_b0 = skewed_block_mapper.blockDescriptor(0);
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1));
+  VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100));
+}
+
+// Given a TensorBlock "visit" every element accessible though it, and a keep an
+// index in the visited set. Verify that every coeff accessed only once.
+template<int NumDims, int Layout>
+static void UpdateCoeffSet(
+    const DSizes<Index, NumDims>& tensor_strides,
+    const internal::TensorBlockDescriptor<NumDims>& block,
+    Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
+  const DSizes<Index, NumDims>& block_sizes = block.dimensions();
+
+  for (int i = 0; i < block_sizes[dim_index]; ++i) {
+    if (tensor_strides[dim_index] == 1) {
+      typedef std::pair<std::set<Index>::iterator, bool> ReturnType;
+      ReturnType inserted = visited_coeffs->insert(first_coeff_index + i);
+      VERIFY_IS_EQUAL(inserted.second, true);
+    } else {
+      int next_dim_index = dim_index + choose(Layout, -1, 1);
+      UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index,
+                                         next_dim_index, visited_coeffs);
+      first_coeff_index += tensor_strides[dim_index];
+    }
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+  typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>();
+  DSizes<Index, NumDims> strides = internal::strides<Layout>(dims);
+
+  // Keep track of elements indices available via block access.
+  std::set<Index> coeff_set;
+
+  // Try different combinations of block types and sizes.
+  TensorBlockMapper block_mapper(
+      dims, {RandomShape(), RandomTargetSize(dims), zeroCost()});
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto block = block_mapper.blockDescriptor(i);
+    UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(),
+                                    choose(Layout, NumDims - 1, 0),
+                                    &coeff_set);
+  }
+
+  // Verify that every coefficient in the original Tensor is accessible through
+  // TensorBlock only once.
+  Index total_coeffs = dims.TotalSize();
+  VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
+  VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+  VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                         const array<Index, NumDims>& output_to_input_dim_map,
+                         const array<Index, NumDims>& input_strides,
+                         const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+    const array<Index, NumDims>& sizes) {
+  array<Index, NumDims> strides;
+  if (Layout == ColMajor) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * sizes[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * sizes[i + 1];
+    }
+  }
+  return strides;
+}
+
+template<typename Scalar, typename StorageIndex, int Dim>
+class EqualityChecker
+{
+    const Scalar* input_data;
+    const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides;
+    void check_recursive(const Scalar* input, const Scalar* output, int depth=0) const
+    {
+        if(depth==Dim)
+        {
+            VERIFY_IS_EQUAL(*input, *output);
+            return;
+        }
+
+        for(int i=0; i<output_dims[depth]; ++i)
+        {
+            check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1);
+        }
+    }
+public:
+    EqualityChecker(const Scalar* input_data_,
+            const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_,
+            const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_)
+        : input_data(input_data_)
+        , input_dims(input_dims_), input_strides(input_strides_)
+        , output_dims(output_dims_), output_strides(output_strides_)
+        {}
+
+    void operator()(const Scalar* output_data) const
+    {
+        check_recursive(input_data, output_data);
+    }
+};
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  {
+    // Test shape 'UniformAllDims' with uniform 'max_coeff count'.
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    for (int i = 0; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // partially into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first inner-most dimension.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+  // fully into first few inner-most dimensions.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'UniformAllDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+    const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
+    TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+                                          max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(9, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+  typedef internal::TensorBlockDescriptor<5> TensorBlock;
+  typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+  // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(10, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    for (int i = 1; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    for (int i = 3; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to second inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(3, block.dimensions()[1]);
+    for (int i = 2; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(15, block.dimensions()[3]);
+    for (int i = 2; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+  // and partial allocation to third inner-dim.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 3; i < 5; ++i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+    for (int i = 1; i >= 0; --i) {
+      VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+    }
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+
+  // Test shape 'SkewedInnerDims' with full allocation to all dims.
+  if (Layout == ColMajor) {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  } else {
+    DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+    const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+    TensorBlockMapper block_mapper(
+        dims,
+        {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+    TensorBlock block = block_mapper.blockDescriptor(0);
+    VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+    VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+    VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+    VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+    VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+    VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+  }
+}
+
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+  // Test blocking of tensors with zero dimensions:
+  //  - we must not crash on asserts and divisions by zero
+  //  - we must not return block with zero dimensions
+  //    (recipe for overflows/underflows, divisions by zero and NaNs later)
+  //  - total block count must be zero
+  {
+    typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;
+
+    DSizes<Index, 1> dims(0);
+    for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+      TensorBlockMapper block_mapper(
+          dims, {block_shape, max_coeff_count, zeroCost()});
+      VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+      VERIFY(block_mapper.blockTotalSize() >= 1);
+    }
+  }
+
+  {
+    typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+    for (int dim1 = 0; dim1 < 3; ++dim1) {
+      for (int dim2 = 0; dim2 < 3; ++dim2) {
+        DSizes<Index, 2> dims(dim1, dim2);
+        for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+          TensorBlockMapper block_mapper(
+              dims, {block_shape, max_coeff_count, zeroCost()});
+          if (dim1 * dim2 == 0) {
+            VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+          }
+          VERIFY(block_mapper.blockTotalSize() >= 1);
+        }
+      }
+    }
+  }
+}
+
+#define TEST_LAYOUTS(NAME) \
+  CALL_SUBTEST(NAME<ColMajor>()); \
+  CALL_SUBTEST(NAME<RowMajor>())
+
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME)    \
+  CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+  CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+  CALL_SUBTEST(NAME<RowMajor>(ARG))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
+  TEST_LAYOUTS(test_block_mapper_sanity);
+  TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+  TEST_LAYOUTS(test_uniform_block_shape);
+  TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+  TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+}
+
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_WITH_ARG
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
new file mode 100644
index 000000000..b2e26ebb7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -0,0 +1,858 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+using Eigen::internal::TensorBlockDescriptor;
+using Eigen::internal::TensorExecutor;
+
+// -------------------------------------------------------------------------- //
+// Utility functions to generate random tensors, blocks, and evaluate them.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+// Block offsets and extents allows to construct a TensorSlicingOp corresponding
+// to a TensorBlockDescriptor.
+template <int NumDims>
+struct TensorBlockParams {
+  DSizes<Index, NumDims> offsets;
+  DSizes<Index, NumDims> sizes;
+  TensorBlockDescriptor<NumDims, Index> desc;
+};
+
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,
+                                              Index min, Index max) {
+  // Choose random offsets and sizes along all tensor dimensions.
+  DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max));
+  DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max));
+
+  // Make sure that offset + size do not overflow dims.
+  for (int i = 0; i < NumDims; ++i) {
+    offsets[i] = numext::mini(dims[i] - 1, offsets[i]);
+    sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]);
+  }
+
+  Index offset = 0;
+  DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims);
+  for (int i = 0; i < NumDims; ++i) {
+    offset += strides[i] * offsets[i];
+  }
+
+  return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)};
+}
+
+// Generate block with block sizes skewed towards inner dimensions. This type of
+// block is required for evaluating broadcast expressions.
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> SkewedInnerBlock(
+    DSizes<Index, NumDims> dims) {
+  using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
+  BlockMapper block_mapper(dims,
+                           {internal::TensorBlockShapeType::kSkewedInnerDims,
+                            internal::random<size_t>(1, dims.TotalSize()),
+                            {0, 0, 0}});
+
+  Index total_blocks = block_mapper.blockCount();
+  Index block_index = internal::random<Index>(0, total_blocks - 1);
+  auto block = block_mapper.blockDescriptor(block_index);
+  DSizes<Index, NumDims> sizes = block.dimensions();
+
+  auto strides = internal::strides<Layout>(dims);
+  DSizes<Index, NumDims> offsets;
+
+  // Compute offsets for the first block coefficient.
+  Index index = block.offset();
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[0] = index;
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / strides[i];
+      index -= idx * strides[i];
+      offsets[i] = idx;
+    }
+    if (NumDims > 0) offsets[NumDims - 1] = index;
+  }
+
+  return {offsets, sizes, block};
+}
+
+template <int NumDims>
+static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
+  DSizes<Index, NumDims> offsets;
+  for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+
+  return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
+}
+
+inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) {
+  Eigen::IndexList<Index, Eigen::type2index<1>> ret;
+  ret.set(0, n);
+  return ret;
+}
+inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) {
+  Eigen::IndexList<Eigen::type2index<1>, Index> ret;
+  ret.set(1, m);
+  return ret;
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that block expression evaluation produces the same result as a
+// TensorSliceOp (reading a tensor block is same to taking a tensor slice).
+
+template <typename T, int NumDims, int Layout, typename Expression,
+          typename GenBlockParams>
+static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // Scratch memory allocator for block evaluation.
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  TensorBlockScratch scratch(d);
+
+  // TensorEvaluator is needed to produce tensor blocks of the expression.
+  auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
+  eval.evalSubExprsIfNeeded(nullptr);
+
+  // Choose a random offsets, sizes and TensorBlockDescriptor.
+  TensorBlockParams<NumDims> block_params = gen_block();
+
+  // Evaluate TensorBlock expression into a tensor.
+  Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
+
+  // Dimensions for the potential destination buffer.
+  DSizes<Index, NumDims> dst_dims;
+  if (internal::random<bool>()) {
+    dst_dims = block_params.desc.dimensions();
+  } else {
+    for (int i = 0; i < NumDims; ++i) {
+      Index extent = internal::random<Index>(0, 5);
+      dst_dims[i] = block_params.desc.dimension(i) + extent;
+    }
+  }
+
+  // Maybe use this tensor as a block desc destination.
+  Tensor<T, NumDims, Layout> dst(dst_dims);
+  dst.setZero();
+  if (internal::random<bool>()) {
+    block_params.desc.template AddDestinationBuffer<Layout>(
+        dst.data(), internal::strides<Layout>(dst.dimensions()));
+  }
+
+  const bool root_of_expr = internal::random<bool>();
+  auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);
+
+  if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
+    // Copy data from destination buffer.
+    if (dimensions_match(dst.dimensions(), block.dimensions())) {
+      block = dst;
+    } else {
+      DSizes<Index, NumDims> offsets;
+      for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+      block = dst.slice(offsets, block.dimensions());
+    }
+
+  } else {
+    // Assign to block from expression.
+    auto b_expr = tensor_block.expr();
+
+    // We explicitly disable vectorization and tiling, to run a simple coefficient
+    // wise assignment loop, because it's very simple and should be correct.
+    using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
+    using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
+                                         internal::TiledEvaluation::Off>;
+    BlockExecutor::run(BlockAssign(block, b_expr), d);
+  }
+
+  // Cleanup temporary buffers owned by a tensor block.
+  tensor_block.cleanup();
+
+  // Compute a Tensor slice corresponding to a Tensor block.
+  Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions());
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(slice, s_expr), d);
+
+  // Tensor block and tensor slice must be the same.
+  for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Identity tensor expression transformation.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_with_unary_expr_block() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      (lhs.square() + rhs.square()).sqrt(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_broadcast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5);
+
+  DSizes<Index, NumDims> bcasted_dims;
+  for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i];
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast),
+      [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); });
+
+  // Check that desc.destination() memory is not shared between two broadcast
+  // materializations.
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.broadcast(bcast) * input.abs().broadcast(bcast),
+      [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_cast() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.template cast<int>().template cast<T>(),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_select() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> lhs(dims);
+  Tensor<T, NumDims, Layout> rhs(dims);
+  Tensor<bool, NumDims, Layout> cond(dims);
+  lhs.setRandom();
+  rhs.setRandom();
+  cond.setRandom();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 20);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_padding() {
+  const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4);
+  DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4);
+  array<std::pair<Index, Index>, NumDims> paddings;
+  for (int i = 0; i < NumDims; ++i) {
+    paddings[i] = std::make_pair(pad_before[i], pad_after[i]);
+  }
+
+  // Test squeezing reads from inner dim.
+  if (internal::random<bool>()) {
+    pad_before[inner_dim] = 0;
+    pad_after[inner_dim] = 0;
+    paddings[inner_dim] = std::make_pair(0, 0);
+  }
+
+  DSizes<Index, NumDims> padded_dims;
+  for (int i = 0; i < NumDims; ++i) {
+    padded_dims[i] = dims[i] + pad_before[i] + pad_after[i];
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return FixedSizeBlock(padded_dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.pad(paddings),
+      [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  // Block buffer forwarding.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  // Block expression assignment.
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+      input.abs().chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
+
+template<typename T, int NumDims>
+struct SimpleTensorGenerator {
+  T operator()(const array<Index, NumDims>& coords) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * coords[i]);
+    }
+    return result;
+  }
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<int NumDims>
+struct SimpleTensorGenerator<bool, NumDims> {
+  bool operator()(const array<Index, NumDims>& coords) const {
+    bool result = false;
+    for (int i = 0; i < NumDims; ++i) {
+      result ^= coords[i];
+    }
+    return result;
+  }
+};
+
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_generator() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  auto generator = SimpleTensorGenerator<T, NumDims>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.generate(generator),
+      [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reverse() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Randomly reverse dimensions.
+  Eigen::DSizes<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() {
+    return RandomBlock<Layout>(dims, 1, 10);
+  });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Pick a random slice of an input tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(
+      input.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockEvaluator<T, NumDims, Layout>(
+        input.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+    break;
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_reshape_with_bcast() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(1, dim);
+  Tensor<T, 2, Layout> rhs(dim, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto reshapeLhs = NByOne(dim);
+  auto reshapeRhs = OneByM(dim);
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      lhs.reshape(reshapeLhs).broadcast(bcastLhs) *
+          rhs.reshape(reshapeRhs).broadcast(bcastRhs),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_forced_eval() {
+  Index dim = internal::random<Index>(1, 100);
+
+  Tensor<T, 2, Layout> lhs(dim, 1);
+  Tensor<T, 2, Layout> rhs(1, dim);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  auto bcastLhs = OneByM(dim);
+  auto bcastRhs = NByOne(dim);
+
+  DSizes<Index, 2> dims(dim, dim);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+      [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_chipping_of_bcast() {
+  if (Layout != static_cast<int>(RowMajor)) return;
+
+  Index dim0 = internal::random<Index>(1, 10);
+  Index dim1 = internal::random<Index>(1, 10);
+  Index dim2 = internal::random<Index>(1, 10);
+
+  Tensor<T, 3, Layout> input(1, dim1, dim2);
+  input.setRandom();
+
+  Eigen::array<Index, 3> bcast = {{dim0, 1, 1}};
+  DSizes<Index, 2> chipped_dims(dim0, dim2);
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); });
+
+  VerifyBlockEvaluator<T, 2, Layout>(
+      input.broadcast(bcast).chip(0, 1),
+      [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); });
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that assigning block to a Tensor expression produces the same result
+// as an assignment to TensorSliceOp (writing a block is is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+                                  Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // We use tensor evaluator as a target for block and slice assignments.
+  auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits in full expression.
+  TensorBlockParams<NumExprDims> block_params = gen_block();
+
+  // Generate random data of the selected block size.
+  Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+  block.setRandom();
+
+  // ************************************************************************ //
+  // (1) Assignment from a block.
+
+  // Construct a materialize block from a random generated block tensor.
+  internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+      internal::TensorBlockKind::kView, block.data(), block.dimensions());
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Use evaluator to write block into a tensor.
+  eval.writeBlock(block_params.desc, blk);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+  // ************************************************************************ //
+  // (2) Assignment to a slice
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Assign block to a slice of original expression
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate slice expression.
+  using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> slice_assigned = tensor;
+
+  for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i));
+  }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map, [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_reshape() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed));
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.reshape(shuffled),
+      [&shuffled]() { return FixedSizeBlock(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_chipping() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  Index chip_dim = internal::random<int>(0, NumDims - 1);
+  Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+  DSizes<Index, NumDims - 1> chipped_dims;
+  for (Index i = 0; i < chip_dim; ++i) {
+    chipped_dims[i] = dims[i];
+  }
+  for (Index i = chip_dim + 1; i < NumDims; ++i) {
+    chipped_dims[i - 1] = dims[i];
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
+
+  VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+      tensor, map.chip(chip_offset, chip_dim),
+      [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_slice() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  // Pick a random slice of tensor.
+  DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+  DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME)           \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME)     \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>()));  \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+  // clang-format off
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+  // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 000000000..52f7dde9b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+  DSizes<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<Index>(min, max);
+  }
+  return DSizes<Index, NumDims>(dims);
+}
+
+static internal::TensorBlockShapeType RandomBlockShape() {
+  return internal::random<bool>()
+         ? internal::TensorBlockShapeType::kUniformAllDims
+         : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) {
+  return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+                           const array<Index, NumDims>& output_to_input_dim_map,
+                           const array<Index, NumDims>& input_strides,
+                           const array<Index, NumDims>& output_strides) {
+  int input_index = 0;
+  if (Layout == ColMajor) {
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[0]];
+  } else {
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = output_index / output_strides[i];
+      input_index += idx * input_strides[output_to_input_dim_map[i]];
+      output_index -= idx * output_strides[i];
+    }
+    return input_index +
+           output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+  using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(dims);
+
+  // Construct a tensor block mapper.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(
+      dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (int i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, desc.offset());
+
+      TensorBlockIO::Copy(dst, src);
+    }
+
+    {
+      // Write from block buffer to output.
+      IODst dst(blk_dims, output_strides, output_data, desc.offset());
+      IOSrc src(blk_strides, block_data, 0);
+
+      TensorBlockIO::Copy(dst, src);
+    }
+  }
+
+  for (int i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+  // Generate a random input Tensor.
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<int> shuffle;
+
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+  DSizes<Index, NumDims> output_tensor_dims;
+  DSizes<Index, NumDims> input_to_output_dim_map;
+  DSizes<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
+    output_tensor_dims[shuffle[i]] = dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+  // Construct a tensor block mapper.
+  // NOTE: Tensor block mapper works with shuffled dimensions.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+        desc.offset(), output_to_input_dim_map, input_strides,
+        output_strides);
+
+    // NOTE: Block dimensions are in the same order as output dimensions.
+
+    using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+    using IODst = typename TensorBlockIO::Dst;
+    using IOSrc = typename TensorBlockIO::Src;
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, first_coeff_index);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+
+    {
+      // We need to convert block dimensions from output to input order.
+      auto dst_dims = blk_dims;
+      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+      }
+
+      // Write from block buffer to output.
+      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+      IOSrc src(blk_strides, block_data, 0);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+  }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads along inner dimensions
+// in this case is illegal, because we reorder innermost dimension.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
+  DSizes<Index, 3> tensor_dims(7, 9, 7);
+  DSizes<Index, 3> block_dims = tensor_dims;
+
+  DSizes<int, 3> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 2;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 0;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 3, Layout> block(block_dims);
+  Tensor<float, 3, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        float block_value = block_tensor(d2, d1, d0);
+        float tensor_value = tensor_tensor(d0, d1, d2);
+        VERIFY_IS_EQUAL(block_value, tensor_value);
+      }
+    }
+  }
+}
+
+// This is the special case for reading data with reordering, when dimensions
+// before/after reordering are the same. Squeezing reads in this case is allowed
+// because we reorder outer dimensions.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_squeeze() {
+  DSizes<Index, 4> tensor_dims(7, 5, 9, 9);
+  DSizes<Index, 4> block_dims = tensor_dims;
+
+  DSizes<int, 4> block_to_tensor_dim;
+  block_to_tensor_dim[0] = 0;
+  block_to_tensor_dim[1] = 1;
+  block_to_tensor_dim[2] = 3;
+  block_to_tensor_dim[3] = 2;
+
+  auto tensor_strides = internal::strides<Layout>(tensor_dims);
+  auto block_strides = internal::strides<Layout>(block_dims);
+
+  Tensor<float, 4, Layout> block(block_dims);
+  Tensor<float, 4, Layout> tensor(tensor_dims);
+  tensor.setRandom();
+
+  float* tensor_data = tensor.data();
+  float* block_data = block.data();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Read from a tensor into a block.
+  IODst dst(block_dims, block_strides, block_data, 0);
+  IOSrc src(tensor_strides, tensor_data, 0);
+
+  TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+  TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
+  TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+  for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+    for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+      for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+        for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
+          float block_value = block_tensor(d0, d1, d3, d2);
+          float tensor_value = tensor_tensor(d0, d1, d2, d3);
+          VERIFY_IS_EQUAL(block_value, tensor_value);
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_zero_stride() {
+  DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30);
+
+  DSizes<Index, 5> input_tensor_dims = rnd_dims;
+  input_tensor_dims[0] = 1;
+  input_tensor_dims[2] = 1;
+  input_tensor_dims[4] = 1;
+
+  Tensor<float, 5, Layout> input(input_tensor_dims);
+  input.setRandom();
+
+  DSizes<Index, 5> output_tensor_dims = rnd_dims;
+
+  auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims);
+  auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims);
+
+  auto input_tensor_strides_with_zeros = input_tensor_strides;
+  input_tensor_strides_with_zeros[0] = 0;
+  input_tensor_strides_with_zeros[2] = 0;
+  input_tensor_strides_with_zeros[4] = 0;
+
+  Tensor<float, 5, Layout> output(output_tensor_dims);
+  output.setRandom();
+
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Write data from input to output with broadcasting in dims [0, 2, 4].
+  IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0);
+  IOSrc src(input_tensor_strides_with_zeros, input.data(), 0);
+  TensorBlockIO::Copy(dst, src);
+
+  for (int i = 0; i < output_tensor_dims[0]; ++i) {
+    for (int j = 0; j < output_tensor_dims[1]; ++j) {
+      for (int k = 0; k < output_tensor_dims[2]; ++k) {
+        for (int l = 0; l < output_tensor_dims[3]; ++l) {
+          for (int m = 0; m < output_tensor_dims[4]; ++m) {
+            float input_value = input(0, j, 0, l, 0);
+            float output_value = output(i, j, k, l, m);
+            VERIFY_IS_EQUAL(input_value, output_value);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+  using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+  using IODst = typename TensorBlockIO::Dst;
+  using IOSrc = typename TensorBlockIO::Src;
+
+  // Total size > 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+
+  // Total size == 1.
+  {
+    DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+    auto strides = internal::strides<Layout>(block_sizes);
+
+    // Create a random input tensor.
+    Tensor<float, 5> input(block_sizes);
+    input.setRandom();
+
+    Tensor<float, 5> output(block_sizes);
+
+    IODst dst(block_sizes, strides, output.data(), 0);
+    IOSrc src(strides, input.data());
+    TensorBlockIO::Copy(dst, src);
+
+    for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+    }
+  }
+}
+
+#define CALL_SUBTESTS(NAME)                   \
+  CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST((NAME<bool, 5, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
+  // clang-format off
+  CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target);
+  CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions);
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>());
+  CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_zero_stride<RowMajor>());
+  CALL_SUBTEST(test_block_io_zero_stride<ColMajor>());
+
+  CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>());
+  CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>());
+  // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 7201bfe37..20f84b8e0 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -13,8 +13,8 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
@@ -25,50 +25,120 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
-static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){
 
   // BROADCAST test:
-  array<int, 4> in_range   = {{2, 3, 5, 7}};
-  array<int, 4> broadcasts = {{2, 3, 1, 4}};
-  array<int, 4> out_range;  // = in_range * broadcasts
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
   for (size_t i = 0; i < out_range.size(); ++i)
     out_range[i] = in_range[i] * broadcasts[i];
 
-  Tensor<float, 4>  input(in_range);
-  Tensor<float, 4> out(out_range);
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
 
   for (size_t i = 0; i < in_range.size(); ++i)
     VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
 
 
-  for (int i = 0; i < input.size(); ++i)
-    input(i) = static_cast<float>(i);
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
 
-  float * gpu_in_data  = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data  = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
 
-  TensorMap<Tensor<float, 4>>  gpu_in(gpu_in_data, in_range);
-  TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
-  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
+  TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
   gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
 
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 9; ++j) {
-      for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 28; ++l) {
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
           VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
         }
       }
     }
   }
+  printf("Broadcast Test with fixed size Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+
+  // BROADCAST test:
+  IndexType inDim1=2;
+  IndexType inDim2=3;
+  IndexType inDim3=5;
+  IndexType inDim4=7;
+  IndexType bDim1=2;
+  IndexType bDim2=3;
+  IndexType bDim3=1;
+  IndexType bDim4=4;
+  array<IndexType, 4> in_range   = {{inDim1, inDim2, inDim3, inDim4}};
+  array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+  array<IndexType, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<DataType, 4, DataLayout, IndexType>  input(in_range);
+  Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (IndexType i = 0; i < input.size(); ++i)
+    input(i) = static_cast<DataType>(i);
+
+  DataType * gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data  = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>>  gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+    for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+      for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+        for (IndexType l = 0; l < inDim4*bDim4; ++l) {
+          VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l));
+        }
+      }
+    }
+  }
   printf("Broadcast Test Passed\n");
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
-void test_cxx11_tensor_broadcast_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_broadcast_sycl(sycl_device));
+template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+  test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
+  }
 }
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 5c0ea5889..d3dab891f 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
     }
   }
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
   tensor.setRandom();
   broadcast = tensor.broadcast(broadcasts);
 
@@ -115,7 +124,7 @@ static void test_static_broadcasting()
   Tensor<float, 3, DataLayout> tensor(8,3,5);
   tensor.setRandom();
 
-#if EIGEN_HAS_CONSTEXPR
+#if defined(EIGEN_HAS_INDEX_LIST)
   Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
 #else
   Eigen::array<int, 3> broadcasts;
@@ -139,7 +148,16 @@ static void test_static_broadcasting()
     }
   }
 
+#if EIGEN_HAS_VARIADIC_TEMPLATES
   tensor.resize(11,3,5);
+#else
+  array<Index, 3> new_dims;
+  new_dims[0] = 11;
+  new_dims[1] = 3;
+  new_dims[2] = 5;
+  tensor.resize(new_dims);
+#endif
+
   tensor.setRandom();
   broadcast = tensor.broadcast(broadcasts);
 
@@ -180,8 +198,119 @@ static void test_fixed_size_broadcasting()
 #endif
 }
 
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+  Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 9;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 9; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+  Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+  Tensor<float, 3, DataLayout> tensor(1,7,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 13;
+  Tensor<float, 3, DataLayout> broadcasted;
+  broadcasted = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+  Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 5;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 13; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
 
-void test_cxx11_tensor_broadcasting()
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
 {
   CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
   CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
@@ -191,4 +320,12 @@ void test_cxx11_tensor_broadcasting()
   CALL_SUBTEST(test_static_broadcasting<RowMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
new file mode 100644
index 000000000..72cb62fd5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+template <typename T> T square(T x) { return x * x; }
+template <typename T> T cube(T x) { return x * x * x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+}
+}
+
+struct EqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+  template <typename Lhs, typename Rhs>
+  void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
+
+template <typename DataType, int DataLayout,
+          typename Assignement, typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+                                    const array<int64_t, 3>& tensor_range) {
+  Operator op;
+  Assignement asgn;
+  {
+    /* Assignement(out, Operator(in)) */
+    Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    in = in.random() + DataType(0.01);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data = static_cast<DataType *>(
+        sycl_device.allocate(in.size() * sizeof(DataType)));
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                   (in.size()) * sizeof(DataType));
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(in(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data);
+    sycl_device.deallocate(gpu_data_out);
+  }
+  {
+    /* Assignement(out, Operator(out)) */
+    Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+    out = out.random() + DataType(0.01);
+    Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+    DataType *gpu_data_out = static_cast<DataType *>(
+        sycl_device.allocate(out.size() * sizeof(DataType)));
+    TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+    sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+                                   (out.size()) * sizeof(DataType));
+    auto device_expr = gpu_out.device(sycl_device);
+    asgn(device_expr, op(gpu_out));
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                   (out.size()) * sizeof(DataType));
+    for (int64_t i = 0; i < out.size(); ++i) {
+      DataType ver = reference(i);
+      asgn(ver, op(reference(i)));
+      VERIFY_IS_APPROX(out(i), ver);
+    }
+    sycl_device.deallocate(gpu_data_out);
+  }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC)                                 \
+  struct op_##FUNC {                                               \
+    template <typename T>                                          \
+    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {   \
+      return cl::sycl::FUNC(x);                                    \
+    }                                                              \
+    template <typename T>                                          \
+    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+      return x.FUNC();                                             \
+    }                                                              \
+  };
+
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
+
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+                                         const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+  test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+                                 op_##FUNC>(sycl_device, tensor_range)
+  RUN_UNARY_TEST(abs);
+  RUN_UNARY_TEST(sqrt);
+  RUN_UNARY_TEST(rsqrt);
+  RUN_UNARY_TEST(square);
+  RUN_UNARY_TEST(cube);
+  RUN_UNARY_TEST(inverse);
+  RUN_UNARY_TEST(tanh);
+  RUN_UNARY_TEST(exp);
+  RUN_UNARY_TEST(expm1);
+  RUN_UNARY_TEST(log);
+  RUN_UNARY_TEST(ceil);
+  RUN_UNARY_TEST(floor);
+  RUN_UNARY_TEST(round);
+  RUN_UNARY_TEST(log1p);
+  RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+  Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+  in = in.random() + DataType(0.01);
+  DataType *gpu_data = static_cast<DataType *>(
+      sycl_device.allocate(in.size() * sizeof(DataType)));
+  bool *gpu_data_out =
+      static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+  TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+                                 (in.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(bool));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_EQUAL(out(i), op(in(i)));
+  }
+  sycl_device.deallocate(gpu_data);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+                         const array<int64_t, 3>& tensor_range) {
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      PlusEqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_for_assignement<DataType, DataLayout,
+                                      EqualAssignement>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isnan>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isfinite>(sycl_device, tensor_range);
+  test_unary_builtins_return_bool<DataType, DataLayout,
+                                  op_isinf>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+
+  test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+                               const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, in_2) */
+  Operator op;
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random() + DataType(0.01);
+  in_2 = in_2.random() + DataType(0.01);
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_2 = static_cast<DataType *>(
+      sycl_device.allocate(in_2.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+                                 (in_2.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_2);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+                                     const array<int64_t, 3>& tensor_range) {
+  /* out = op(in_1, 2) */
+  Operator op;
+  const DataType arg2(2);
+  Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+  Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+  in_1 = in_1.random();
+  Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+  DataType *gpu_data_1 = static_cast<DataType *>(
+      sycl_device.allocate(in_1.size() * sizeof(DataType)));
+  DataType *gpu_data_out = static_cast<DataType *>(
+      sycl_device.allocate(out.size() * sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+  sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+                                 (in_1.size()) * sizeof(DataType));
+  gpu_out.device(sycl_device) = op(gpu_1, arg2);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+                                 (out.size()) * sizeof(DataType));
+  for (int64_t i = 0; i < out.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
+  }
+  sycl_device.deallocate(gpu_data_1);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+#define DECLARE_BINARY_STRUCT(FUNC)                                                          \
+  struct op_##FUNC {                                                                         \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) {            \
+      return cl::sycl::FUNC(x, y);                                                           \
+    }                                                                                        \
+    template <typename T1, typename T2>                                                      \
+    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+      return x.FUNC(y);                                                                      \
+    }                                                                                        \
+  };
+
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
+
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR)                          \
+  struct op_##NAME {                                                      \
+    template <typename T1, typename T2>                                   \
+    auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+      return x OPERATOR y;                                                \
+    }                                                                     \
+  };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+                          const array<int64_t, 3>& tensor_range) {
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMax>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_cwiseMin>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_plus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_minus>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_times>(sycl_device, tensor_range);
+  test_binary_builtins_func<DataType, DataLayout,
+                            op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+  test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+  int64_t sizeDim1 = 10;
+  int64_t sizeDim2 = 10;
+  int64_t sizeDim3 = 10;
+  array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+  test_binary_builtins_fixed_arg2<DataType, RowMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+  test_binary_builtins_fixed_arg2<DataType, ColMajor,
+                                  op_modulo>(sycl_device, tensor_range);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(device);
+    Eigen::SyclDevice sycl_device(&queueInterface);
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index 88c233994..97923d15f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -9,20 +9,17 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
-void test_cuda_conversion() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -75,8 +72,8 @@ void test_fallback_conversion() {
 }
 
 
-void test_cxx11_tensor_cast_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
 {
-  CALL_SUBTEST(test_cuda_conversion());
+  CALL_SUBTEST(test_gpu_conversion());
   CALL_SUBTEST(test_fallback_conversion());
 }
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 3c6d0d2ff..45456f3ef 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -8,6 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "main.h"
+#include "random_without_cast_overflow.h"
 
 #include <Eigen/CXX11/Tensor>
 
@@ -104,12 +105,82 @@ static void test_small_to_big_type_cast()
   }
 }
 
+template <typename FromType, typename ToType>
+static void test_type_cast() {
+  Tensor<FromType, 2> ftensor(100, 200);
+  // Generate random values for a valid cast.
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value();
+    }
+  }
+
+  Tensor<ToType, 2> ttensor(100, 200);
+  ttensor = ftensor.template cast<ToType>();
+
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 200; ++j) {
+      const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j));
+      VERIFY_IS_APPROX(ttensor(i, j), ref);
+    }
+  }
+}
+
+template<typename Scalar, typename EnableIf = void>
+struct test_cast_runner {
+  static void run() {
+    test_type_cast<Scalar, bool>();
+    test_type_cast<Scalar, int8_t>();
+    test_type_cast<Scalar, int16_t>();
+    test_type_cast<Scalar, int32_t>();
+    test_type_cast<Scalar, int64_t>();
+    test_type_cast<Scalar, uint8_t>();
+    test_type_cast<Scalar, uint16_t>();
+    test_type_cast<Scalar, uint32_t>();
+    test_type_cast<Scalar, uint64_t>();
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, float>();
+    test_type_cast<Scalar, double>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
+// Only certain types allow cast from std::complex<>.
+template<typename Scalar>
+struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+  static void run() {
+    test_type_cast<Scalar, half>();
+    test_type_cast<Scalar, bfloat16>();
+    test_type_cast<Scalar, std::complex<float>>();
+    test_type_cast<Scalar, std::complex<double>>();
+  }
+};
+
 
-void test_cxx11_tensor_casts()
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
 {
-   CALL_SUBTEST(test_simple_cast());
-   CALL_SUBTEST(test_vectorized_cast());
-   CALL_SUBTEST(test_float_to_int_cast());
-   CALL_SUBTEST(test_big_to_small_type_cast());
-   CALL_SUBTEST(test_small_to_big_type_cast());
+  CALL_SUBTEST(test_simple_cast());
+  CALL_SUBTEST(test_vectorized_cast());
+  CALL_SUBTEST(test_float_to_int_cast());
+  CALL_SUBTEST(test_big_to_small_type_cast());
+  CALL_SUBTEST(test_small_to_big_type_cast());
+
+  CALL_SUBTEST(test_cast_runner<bool>::run());
+  CALL_SUBTEST(test_cast_runner<int8_t>::run());
+  CALL_SUBTEST(test_cast_runner<int16_t>::run());
+  CALL_SUBTEST(test_cast_runner<int32_t>::run());
+  CALL_SUBTEST(test_cast_runner<int64_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint8_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint16_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint32_t>::run());
+  CALL_SUBTEST(test_cast_runner<uint64_t>::run());
+  CALL_SUBTEST(test_cast_runner<half>::run());
+  CALL_SUBTEST(test_cast_runner<bfloat16>::run());
+  CALL_SUBTEST(test_cast_runner<float>::run());
+  CALL_SUBTEST(test_cast_runner<double>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<float>>::run());
+  CALL_SUBTEST(test_cast_runner<std::complex<double>>::run());
+
 }
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 1832dec8b..922274462 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -43,7 +43,7 @@ static void test_simple_chip()
   VERIFY_IS_EQUAL(chip2.dimension(2), 7);
   VERIFY_IS_EQUAL(chip2.dimension(3), 11);
   for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
+    for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
         for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -75,7 +75,7 @@ static void test_simple_chip()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 7; ++l) {
+        for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
         }
       }
@@ -126,7 +126,7 @@ static void test_dynamic_chip()
   VERIFY_IS_EQUAL(chip2.dimension(2), 7);
   VERIFY_IS_EQUAL(chip2.dimension(3), 11);
   for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
+    for (int j = 0; j < 5; ++j) {
       for (int k = 0; k < 7; ++k) {
         for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -158,7 +158,7 @@ static void test_dynamic_chip()
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 3; ++j) {
       for (int k = 0; k < 5; ++k) {
-        for (int l = 0; l < 7; ++l) {
+        for (int l = 0; l < 11; ++l) {
           VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
         }
       }
@@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major()
   VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
 }
 
-void test_cxx11_tensor_chipping()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
 {
   CALL_SUBTEST(test_simple_chip<ColMajor>());
   CALL_SUBTEST(test_simple_chip<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
new file mode 100644
index 000000000..1e7093104
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -0,0 +1,623 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+  tensor.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+  VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim2; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+  const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+  VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim3; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+  const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+  DataType* gpu_data_chip3  = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+  gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
+  sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+  VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim4; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+  const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+  DataType* gpu_data_chip4  = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+  gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
+  sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim5; ++l) {
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+        }
+      }
+    }
+  }
+
+
+  array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+  const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+  DataType* gpu_data_chip5  = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+  gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
+  sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+  VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_chip2);
+  sycl_device.deallocate(gpu_data_chip3);
+  sycl_device.deallocate(gpu_data_chip4);
+  sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+
+  Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
+  tensor.setRandom();
+  tensor1.setRandom();
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_chip1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+  DataType* gpu_data_tensor1  = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
+  gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
+  sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim3; ++j) {
+      for (int k = 0; k < sizeDim4; ++k) {
+        for (int l = 0; l < sizeDim5; ++l) {
+          float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
+  Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
+  tensor2.setRandom();
+  const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
+  DataType* gpu_data_tensor2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  DataType* gpu_data_chip2  = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
+  gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
+  sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+  for (int i = 0; i < sizeDim2; ++i) {
+    for (int j = 0; j < sizeDim4; ++j) {
+      for (int k = 0; k < sizeDim5; ++k) {
+        float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
+        VERIFY_IS_EQUAL(chip2(i,j,k), expected);
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_tensor1);
+  sycl_device.deallocate(gpu_data_chip1);
+  sycl_device.deallocate(gpu_data_tensor2);
+  sycl_device.deallocate(gpu_data_chip2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+  Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
+  input1.setRandom();
+  input2.setRandom();
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
+  std::cout << tensorBuffSize << " , "<<  input2TensorBuffSize << std::endl;
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input1  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_input2  = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
+
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
+  gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
+  input3.setRandom();
+
+  const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
+  DataType* gpu_data_input3  = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
+  gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (j != 1) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
+  input4.setRandom();
+
+  const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
+  DataType* gpu_data_input4  = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
+  gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (k != 3) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+  Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
+  input5.setRandom();
+
+  const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
+  DataType* gpu_data_input5  = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
+  gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (l != 4) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
+  input6.setRandom();
+
+  const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
+  DataType* gpu_data_input6  = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
+  gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (m != 5) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  gpu_tensor.device(sycl_device)=gpu_input1;
+  Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
+  input7.setRandom();
+
+  DataType* gpu_data_input7  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
+  gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k <sizeDim3; ++k) {
+        for (int l = 0; l < sizeDim4; ++l) {
+          for (int m = 0; m < sizeDim5; ++m) {
+            if (i != 0) {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+            } else {
+              VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_input1);
+  sycl_device.deallocate(gpu_data_input2);
+  sycl_device.deallocate(gpu_data_input3);
+  sycl_device.deallocate(gpu_data_input4);
+  sycl_device.deallocate(gpu_data_input5);
+  sycl_device.deallocate(gpu_data_input6);
+  sycl_device.deallocate(gpu_data_input7);
+
+}
+
+template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/
+  test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
index b1ff8aecb..1a18e07cc 100644
--- a/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -77,7 +77,7 @@ static void test_equality()
 }
 
 
-void test_cxx11_tensor_comparisons()
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
 {
   CALL_SUBTEST(test_orderings());
   CALL_SUBTEST(test_equality());
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
index 2baf5eaad..99447b21d 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
@@ -8,12 +8,9 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -31,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
   cudaMalloc((void**)(&d_in2), complex_bytes);
   cudaMalloc((void**)(&d_out), complex_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -51,11 +48,13 @@ void test_cuda_complex_cwise_ops() {
     Add = 0,
     Sub,
     Mul,
-    Div
+    Div,
+    Neg,
+    NbOps
   };
 
   Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
-  for (int op = Add; op <= Div; op++) {
+  for (int op = Add; op < NbOps; op++) {
     std::complex<T> expected;
     switch (static_cast<CwiseOp>(op)) {
       case Add:
@@ -74,6 +73,12 @@ void test_cuda_complex_cwise_ops() {
         gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
         expected = a / b;
         break;
+      case Neg:
+        gpu_out.device(gpu_device) = -gpu_in1;
+        expected = -a;
+        break;
+      case NbOps:
+        break;
     }
     assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
                            gpu_device.stream()) == cudaSuccess);
@@ -90,7 +95,7 @@ void test_cuda_complex_cwise_ops() {
 }
 
 
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops)
 {
   CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
   CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu
index d4e111f5d..f8b8ae704 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_gpu.cu
@@ -8,12 +8,9 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex
+
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -37,7 +34,7 @@ void test_cuda_nullary() {
   cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -73,7 +70,7 @@ void test_cuda_nullary() {
 
 static void test_cuda_sum_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -107,10 +104,45 @@ static void test_cuda_sum_reductions() {
   gpu_device.deallocate(gpu_out_ptr);
 }
 
+static void test_cuda_mean_reductions() {
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+  in.setRandom();
+
+  Tensor<std::complex<float>, 0> full_redux;
+  full_redux = in.mean();
+
+  std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+  std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+  std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+  std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.mean();
+
+  Tensor<std::complex<float>, 0> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
 
 static void test_cuda_product_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -145,9 +177,10 @@ static void test_cuda_product_reductions() {
 }
 
 
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(test_cxx11_tensor_complex)
 {
   CALL_SUBTEST(test_cuda_nullary());
   CALL_SUBTEST(test_cuda_sum_reductions());
+  CALL_SUBTEST(test_cuda_mean_reductions());
   CALL_SUBTEST(test_cuda_product_reductions());
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 03ef12e63..bb9418d33 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -50,7 +50,13 @@ static void test_static_dimension_failure()
       .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
       .concatenate(right, 0);
   Tensor<int, 2, DataLayout> alternative = left
-      .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
+   // Clang compiler break with {{{}}} with an ambiguous error on copy constructor
+  // the variadic DSize constructor added for #ifndef EIGEN_EMULATE_CXX11_META_H.
+  // Solution:
+  // either the code should change to 
+  //  Tensor<int, 2>::Dimensions{{2, 3}}
+  // or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}}
+      .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0);
 }
 
 template<int DataLayout>
@@ -123,7 +129,7 @@ static void test_concatenation_as_lvalue()
 }
 
 
-void test_cxx11_tensor_concatenation()
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
 {
    CALL_SUBTEST(test_dimension_failures<ColMajor>());
    CALL_SUBTEST(test_dimension_failures<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
new file mode 100644
index 000000000..765991b35
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  IndexType leftDim3 = 1;
+  Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  IndexType rightDim3 = 1;
+  Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
+
+  //IndexType concatDim1 = 3;
+//	IndexType concatDim2 = 3;
+//	IndexType concatDim3 = 1;
+  //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
+  left.setRandom();
+  right.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  ///
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
+  DataType * gpu_out_data1 =  static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
+
+  //concatenation = left.concatenate(right, 0);
+  gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
+  sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
+  for (IndexType j = 0; j < 3; ++j) {
+    for (IndexType i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  sycl_device.deallocate(gpu_out_data1);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
+  DataType * gpu_out_data2 =  static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
+  gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
+  sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
+    }
+    for (IndexType j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data2);
+  Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
+  DataType * gpu_out_data3 =  static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
+  gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
+  sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
+
+  //concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
+    }
+  }
+  sycl_device.deallocate(gpu_out_data3);
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+}
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType leftDim1 = 2;
+  IndexType leftDim2 = 3;
+  Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
+
+  IndexType rightDim1 = 2;
+  IndexType rightDim2 = 3;
+  Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
+
+  IndexType concatDim1 = 4;
+  IndexType concatDim2 = 3;
+  Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
+  Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
+
+  left.setRandom();
+  right.setRandom();
+  result.setRandom();
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+
+//  t1.concatenate(t2, 0) = result;
+ gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
+ sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(left(i, j), result(i, j));
+      VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+
+template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
+  test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
index ad9c9da39..9d806ee3c 100644
--- a/unsupported/test/cxx11_tensor_const.cpp
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -55,7 +55,7 @@ static void test_assign_of_const_tensor()
 }
 
 
-void test_cxx11_tensor_const()
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
 {
   CALL_SUBTEST(test_simple_assign());
   CALL_SUBTEST(test_assign_of_const_tensor());
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index dd68430ce..575bdc1f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,21 +10,20 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
 template<int DataLayout>
-void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_gpu_contraction(int m_size, int k_size, int n_size)
 {
   std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   // with these dimensions, the output has 300 * 140 elements, which is
@@ -47,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -68,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
   t_result = t_left.contract(t_right, dims);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
       continue;
@@ -81,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_left);
-  cudaFree((void*)d_t_right);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
 }
 
 
@@ -111,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -131,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
   t_result = t_left.contract(t_right, dims);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
       !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
     std::cout << "mismatch detected: " << t_result()
@@ -139,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_left);
-  cudaFree((void*)d_t_right);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_left);
+  gpuFree((void*)d_t_right);
+  gpuFree((void*)d_t_result);
 }
 
 
 template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(k, 128, 128);
-    test_cuda_contraction<RowMajor>(k, 128, 128);
+    test_gpu_contraction<ColMajor>(k, 128, 128);
+    test_gpu_contraction<RowMajor>(k, 128, 128);
   }
 }
 
 template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, k, 128);
-    test_cuda_contraction<RowMajor>(128, k, 128);
+    test_gpu_contraction<ColMajor>(128, k, 128);
+    test_gpu_contraction<RowMajor>(128, k, 128);
   }
 }
 
 template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
   for (int k = 32; k < 256; k++) {
-    test_cuda_contraction<ColMajor>(128, 128, k);
-    test_cuda_contraction<RowMajor>(128, 128, k);
+    test_gpu_contraction<ColMajor>(128, 128, k);
+    test_gpu_contraction<RowMajor>(128, 128, k);
   }
 }
 
 
 template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
   int m_sizes[] = { 31,  39,   63,   64,   65,
                    127, 129,  255,  257 , 511,
                    512, 513, 1023, 1024, 1025};
@@ -188,29 +187,32 @@ void test_cuda_contraction_sizes() {
   for (int i = 0; i < 15; i++) {
     for (int j = 0; j < 15; j++) {
       for (int k = 0; k < 17; k++) {
-        test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+        test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
       }
     }
   }
 }
 
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
 
   CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
   CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
 
-  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
 
-  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
-  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+  CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
 
-  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
-  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+  CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
 
-  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
-  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+  CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif	
 }
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
new file mode 100644
index 000000000..fbcc29358
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -0,0 +1,1026 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <iostream>
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_m(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_k(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
+                                                           128);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_n(const Device &sycl_device) {
+  for (IndexType k = 32; k < 256; k++) {
+    test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
+                                                           128, k);
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_sycl_contraction_sizes(const Device &sycl_device) {
+  IndexType m_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType n_sizes[] = {31,  39,  63,  64,  65,   127,  129, 255,
+                         257, 511, 512, 513, 1023, 1024, 1025};
+
+  IndexType k_sizes[] = {31,  39,  63,  64,  65,  95,   96,   127, 129,
+                         255, 257, 511, 512, 513, 1023, 1024, 1025};
+
+  for (IndexType i = 0; i < 15; i++) {
+    for (IndexType j = 0; j < 15; j++) {
+      for (IndexType k = 0; k < 17; k++) {
+        test_sycl_contraction<DataLayout, DataType, IndexType>(
+            sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
+      }
+    }
+  }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
+                                  IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Allocate buffers twice as big to check for invalid read and write
+  auto padded_left_size = 2 * t_left.size();
+  auto padded_right_size = 2 * t_right.size();
+  auto padded_result_size = 2 * t_result.size();
+
+  std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
+  std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
+  std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  // TensorMaps are still of the same size than the Tensors
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, result_dims);
+
+  // Write nan after the actual buffer to propagate nans everywhere in case of
+  // invalid reads
+  DataType nan = std::numeric_limits<DataType>::quiet_NaN();
+  auto host_left_data = new DataType[padded_left_size];
+  std::copy_n(t_left.data(), t_left.size(), host_left_data);
+  std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
+  auto host_right_data = new DataType[padded_right_size];
+  std::copy_n(t_right.data(), t_right.size(), host_right_data);
+  std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
+  auto host_result_data = new DataType[padded_result_size];
+  std::fill_n(host_result_data, padded_result_size, nan);
+
+  sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
+  sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - host_result_data[i]))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
+                                  error_threshold)) {
+      continue;
+    }
+    if (std::isnan(host_result_data[i])) {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", invalid read detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    } else {
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType " << i << ": "
+                << t_result(i) << " vs " << host_result_data[i] << std::endl;
+    }
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  // Make sure that the rest of the result is still nans
+  for (IndexType i = t_result.size(); i < padded_result_size; i++) {
+    if (std::isnan(host_result_data[i])) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", invalid write detected at IndexType " << i << ": "
+              << host_result_data[i] << std::endl;
+    VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+
+  delete[] host_left_data;
+  delete[] host_right_data;
+  delete[] host_result_data;
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
+                 IndexType n_size) {
+  // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
+  // ")" << std::endl;
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+  Tensor<DataType, 0, DataLayout, IndexType> t_result;
+  Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
+      gpu_t_result(d_t_result);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+          t_result() - t_result_gpu()))) > error_threshold &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
+    std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
+              << " : mismatch detected: " << t_result() << " vs "
+              << t_result_gpu() << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(), t_result());
+  }
+
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_batch(const Device &sycl_device, IndexType m_size,
+                       IndexType k_size, IndexType n_size, IndexType m_batch,
+                       IndexType start, IndexType limit) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  typedef Eigen::array<IndexType, 3> TensorDim;
+  typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
+  TensorDim left_dims = {{m_batch, k_size, m_size}};
+  TensorDim right_dims = {{m_batch, n_size, k_size}};
+  TensorDim res_dims = {{m_batch, m_size, n_size}};
+  Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
+
+  TensorType t_left(left_dims);
+  TensorType t_right(right_dims);
+  TensorType t_result_gpu(res_dims);
+  TensorType t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+  for (int i = start; i < limit; ++i) {
+    auto x = gpu_t_left.template chip<0>(i);
+    auto y = gpu_t_right.template chip<0>(i);
+    auto z = gpu_t_result.template chip<0>(i);
+    z.device(sycl_device) = x.contract(y, contract_pairs);
+  }
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  for (int i = start; i < limit; ++i) {
+    auto x = t_left.template chip<0>(i);
+    auto y = t_right.template chip<0>(i);
+    auto z = t_result.template chip<0>(i);
+    z = x.contract(y, contract_pairs);
+  }
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType j = 0; j < m_size; j++) {
+    for (IndexType i = 0; i < n_size; i++) {
+      if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+              t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
+        continue;
+      }
+      if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
+                                    error_threshold)) {
+        continue;
+      }
+      std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+                << ", mismatch detected at IndexType m: " << j << " n: " << i
+                << " CPU : " << t_result(j, i)
+                << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
+      VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
+    }
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
+                                IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+          typename Device>
+void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
+                                 IndexType k_size, IndexType n_size) {
+  typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+      DimPair;
+  static const DataType error_threshold = DataType(1e-4);
+  Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+  Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+  Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+  Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+  std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType *d_t_left =
+      static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+  DataType *d_t_right =
+      static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+  DataType *d_t_result =
+      static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_left(d_t_left, left_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_right(d_t_right, right_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+      gpu_t_result(d_t_result, res_dims);
+
+  sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+  sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+                                 t_result_bytes);
+
+  t_result = t_left.contract(t_right, dims);
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+              << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+              << " vs " << t_result_gpu(i) << std::endl;
+
+    VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+  }
+  sycl_device.deallocate(d_t_left);
+  sycl_device.deallocate(d_t_right);
+  sycl_device.deallocate(d_t_result);
+}
+
+template <typename Dev>
+void inline tensorOutofBound(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Test out of bound for Tensor-Tensor
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       1024);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       4096);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
+                                                       2048);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       1024);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
+                                                       784);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+                                                       10);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
+                                                       513);
+  test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
+                                                       783);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+                                                       784);
+  test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
+                                                       11);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor out of bound tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
+                                                       128);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_m(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_n(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_k(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_sizes(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
+  test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "tensor tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+template <typename Dev>
+void inline vectorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // VECTOR-VECTOR
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
+                                                       1023);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "contracted tensor tests finished computation at "
+            << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline vectorTensor(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
+                                                       1025);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
+                                                       1024);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
+                                                       1023);
+
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
+                                                       4097);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
+                                                       4096);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
+                                                       4095);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
+                                                       32);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
+                                                       1);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+                                                       1);
+// If the GEMV disabled it will creates one kernel to calculate the contraction.
+// Therefore the acumuation of float number will overflow the precision
+// threshold for float and cause the test to fail. While it the GMV multiple
+// kernel will be created and each one run the overflow of accumutation breaks
+// among the kernels.
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
+                                                       1);
+#endif
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorScalar(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // SCALAR Contraction
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+  test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+  test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_row(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_col(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+  // Tensor Tensor Contraction
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
+                                                       257);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
+                                                       256);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
+                                                       16);
+  test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
+                                                       17);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+                                                   0, 4);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_lhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
+                                                            8);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
+                                                            2048, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            10, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 1024);
+  contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            4096, 1024);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_rhs_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
+                                                            16);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                            17);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                            32);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+                                                            64);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
+                                                            1024, 1024);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+                                                            1024, 4096);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+                                                            1024, 2048);
+  contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+                                                            1024, 784);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_both_transposed_per_device(
+    const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point<std::chrono::system_clock> start, end;
+  start = std::chrono::system_clock::now();
+
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+                                                             17);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+                                                             32);
+  contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
+                                                             16, 64);
+  end = std::chrono::system_clock::now();
+  std::chrono::duration<double> elapsed_seconds = end - start;
+  std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+  std::cout << "finished computation at " << std::ctime(&end_time)
+            << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
+  for (const auto &device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(tensorOutofBound(sycl_device));
+    CALL_SUBTEST_2(tensorTensor(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_m(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_n(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_k(sycl_device));
+    CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
+    CALL_SUBTEST_3(vectorVector(sycl_device));
+    CALL_SUBTEST_4(vectorTensor(sycl_device));
+    CALL_SUBTEST_5(tensorVector(sycl_device));
+    CALL_SUBTEST_6(tensorScalar(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
+    CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
+    CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
+    CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
+    CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index ace97057f..3b5c6a13c 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -471,7 +471,8 @@ static void test_tensor_product()
   mat1.setRandom();
   mat2.setRandom();
 
-  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}});
+  Eigen::array<DimPair, 0> dims;
+  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims);
 
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 3);
@@ -510,36 +511,91 @@ static void test_const_inputs()
   VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
 }
 
-void test_cxx11_tensor_contraction()
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_large_contraction_with_output_kernel() {
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
 {
-  CALL_SUBTEST(test_evals<ColMajor>());
-  CALL_SUBTEST(test_evals<RowMajor>());
-  CALL_SUBTEST(test_scalar<ColMajor>());
-  CALL_SUBTEST(test_scalar<RowMajor>());
-  CALL_SUBTEST(test_multidims<ColMajor>());
-  CALL_SUBTEST(test_multidims<RowMajor>());
-  CALL_SUBTEST(test_holes<ColMajor>());
-  CALL_SUBTEST(test_holes<RowMajor>());
-  CALL_SUBTEST(test_full_redux<ColMajor>());
-  CALL_SUBTEST(test_full_redux<RowMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
-  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
-  CALL_SUBTEST(test_expr<ColMajor>());
-  CALL_SUBTEST(test_expr<RowMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
-  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
-  CALL_SUBTEST(test_consistency<ColMajor>());
-  CALL_SUBTEST(test_consistency<RowMajor>());
-  CALL_SUBTEST(test_large_contraction<ColMajor>());
-  CALL_SUBTEST(test_large_contraction<RowMajor>());
-  CALL_SUBTEST(test_matrix_vector<ColMajor>());
-  CALL_SUBTEST(test_matrix_vector<RowMajor>());
-  CALL_SUBTEST(test_tensor_vector<ColMajor>());
-  CALL_SUBTEST(test_tensor_vector<RowMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
-  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
-  CALL_SUBTEST(test_tensor_product<ColMajor>());
-  CALL_SUBTEST(test_tensor_product<RowMajor>());
-  CALL_SUBTEST(test_const_inputs<ColMajor>());
-  CALL_SUBTEST(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_1(test_evals<ColMajor>());
+  CALL_SUBTEST_1(test_evals<RowMajor>());
+  CALL_SUBTEST_1(test_scalar<ColMajor>());
+  CALL_SUBTEST_1(test_scalar<RowMajor>());
+  CALL_SUBTEST_2(test_multidims<ColMajor>());
+  CALL_SUBTEST_2(test_multidims<RowMajor>());
+  CALL_SUBTEST_2(test_holes<ColMajor>());
+  CALL_SUBTEST_2(test_holes<RowMajor>());
+  CALL_SUBTEST_3(test_full_redux<ColMajor>());
+  CALL_SUBTEST_3(test_full_redux<RowMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST_4(test_expr<ColMajor>());
+  CALL_SUBTEST_4(test_expr<RowMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_consistency<ColMajor>());
+  CALL_SUBTEST_5(test_consistency<RowMajor>());
+  CALL_SUBTEST_5(test_large_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_large_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST_6(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST_6(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST_7(test_tensor_product<ColMajor>());
+  CALL_SUBTEST_7(test_tensor_product<RowMajor>());
+  CALL_SUBTEST_8(test_const_inputs<ColMajor>());
+  CALL_SUBTEST_8(test_const_inputs<RowMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>());
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
 }
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index e3d4675eb..c3688f678 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -25,7 +25,8 @@ static void test_evals()
 
   Tensor<float, 2, DataLayout> result(2,3);
   result.setZero();
-  Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
+  Eigen::array<Tensor<float, 2>::Index, 1> dims3;
+  dims3[0] = 0;
 
   typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
   Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
@@ -136,7 +137,7 @@ static void test_strides() {
                                input(12)*kernel(2)));
 }
 
-void test_cxx11_tensor_convolution()
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
 {
   CALL_SUBTEST(test_evals<ColMajor>());
   CALL_SUBTEST(test_evals<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
new file mode 100644
index 000000000..3954c8a28
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <iomanip>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+static const float error_threshold =1e-4f;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=55;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 1> kernel_dims = {{4}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=51;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 2> dims3{{0,1}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 =53;
+  IndexType indim1= 55;
+  IndexType indim2= 51;
+  IndexType outdim0=50;
+  IndexType outdim1=51;
+  IndexType outdim2=49;
+  Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+  Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
+  Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+  Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+  Eigen::array<IndexType, 3> dims3{{0,1,2}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+  result_host.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  result_host=input.convolve(kernel, dims3);
+
+for(IndexType i=0; i< outdim0; i++ ){
+  for(IndexType j=0; j< outdim1; j++ ){
+    for(IndexType k=0; k< outdim2; k++ ){
+      if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+        std::cout <<std::setprecision(16)<< "mismatch detected at index  ( "<< i  << " , "  << j  << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<<  result_host(i,j,k) << std::endl;
+        assert(false);
+      }
+    }
+  }
+}
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_evals(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 1> kernel_dims = {{2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 3}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
+
+  Eigen::array<IndexType, 1> dims3{{0}};
+
+  input.setRandom();
+  kernel.setRandom();
+  result.setZero();
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr(const Eigen::SyclDevice& sycl_device)
+{
+  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+  Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
+  Eigen::array<IndexType, 2> result_dims = {{2, 2}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_result);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_modes(const Eigen::SyclDevice& sycl_device){
+
+Eigen::array<IndexType, 1> input_dims = {{3}};
+Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+
+input.setRandom();
+kernel.setRandom();
+Eigen::array<IndexType, 1> dims;
+dims[0] = 0;
+
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+
+  Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<DataType, 1, DataLayout, IndexType> valid(1);
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t valid_bytes = valid.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_valid =  static_cast<DataType*>(sycl_device.allocate(valid_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
+
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<DataType, 1, DataLayout, IndexType> same(3);
+  std::size_t same_bytes = same.size() * sizeof(DataType);
+  DataType * d_same =  static_cast<DataType*>(sycl_device.allocate(same_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
+  gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
+
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+
+  Tensor<DataType, 1, DataLayout, IndexType> full(5);
+  std::size_t full_bytes = full.size() * sizeof(DataType);
+  DataType * d_full =  static_cast<DataType*>(sycl_device.allocate(full_bytes));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
+  gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+  sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
+
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+
+  sycl_device.deallocate(d_input);
+  sycl_device.deallocate(d_kernel);
+  sycl_device.deallocate(d_valid);
+  sycl_device.deallocate(d_same);
+  sycl_device.deallocate(d_full);
+
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strides(const Eigen::SyclDevice& sycl_device){
+
+  Eigen::array<IndexType, 1> input_dims = {{13}};
+  Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> result(2);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 1> dims;
+  dims[0] = 0;
+
+  Eigen::array<IndexType, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<IndexType, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  std::size_t input_bytes = input.size()  * sizeof(DataType);
+  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+  std::size_t result_bytes = result.size() * sizeof(DataType);
+
+  DataType * d_input  = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+  DataType * d_kernel  = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+  DataType * d_result =  static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
+  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+  gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
+  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
+  test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, ColMajor, int64_t>(sycl_device);
+  test_evals<float, RowMajor, int64_t>(sycl_device);
+  test_expr<float, ColMajor, int64_t>(sycl_device);
+  test_expr<float, RowMajor, int64_t>(sycl_device);
+  test_modes<float, ColMajor, int64_t>(sycl_device);
+  test_modes<float, RowMajor, int64_t>(sycl_device);
+  test_strides<float, ColMajor, int64_t>(sycl_device);
+  test_strides<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConvolutionPerDevice(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
index 4528cc176..b5dbc97bd 100644
--- a/unsupported/test/cxx11_tensor_custom_index.cpp
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -88,7 +88,7 @@ static void test_sizes_as_index()
 }
 
 
-void test_cxx11_tensor_custom_index() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
   test_map_as_index<ColMajor>();
   test_map_as_index<RowMajor>();
   test_matrix_as_index<ColMajor>();
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
index 8baa477cc..875ea57d2 100644
--- a/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -104,7 +104,7 @@ static void test_custom_binary_op()
 }
 
 
-void test_cxx11_tensor_custom_op()
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
 {
   CALL_SUBTEST(test_custom_unary_op());
   CALL_SUBTEST(test_custom_binary_op());
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
new file mode 100644
index 000000000..d947ead83
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -0,0 +1,170 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template<typename TensorType>
+struct InsertZeros {
+  DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;
+
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 5;
+  Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
+
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(out.dimension(0), 6);
+  VERIFY_IS_EQUAL(out.dimension(1), 10);
+
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(out(i, j), 0);
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+sycl_device.deallocate(gpu_out_data);
+}
+
+template<typename TensorType>
+struct BatchMatMul {
+  DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const TensorType& input1, const TensorType& input2,
+            Output& output, const Device& device) const
+  {
+    typedef typename TensorType::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    for (int64_t i = 0; i < output.dimension(2); ++i) {
+      output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
+    }
+  }
+};
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
+  Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
+  Eigen::array<IndexType, 3> tensorResultRange  = {{2, 7, 5}};
+
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
+
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+  typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
+  TensorType gpu_in1(gpu_in1_data, tensorRange1);
+  TensorType gpu_in2(gpu_in2_data, tensorRange2);
+  TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+
+  gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 5; ++i) {
+    typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
+    TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(custom_op_perDevice<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index fde20ddf2..c9f78d2d3 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -9,16 +9,15 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
@@ -68,22 +67,22 @@ struct CPUContext {
 // Context for evaluation on GPU
 struct GPUContext {
   GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
-    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
     float kernel_1d_val[] = {3.14f, 2.7f};
-    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
 
-    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
     float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
-    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
 
-    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+    assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
     float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
-    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+    assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
   }
   ~GPUContext() {
-    assert(cudaFree(kernel_1d_) == cudaSuccess);
-    assert(cudaFree(kernel_2d_) == cudaSuccess);
-    assert(cudaFree(kernel_3d_) == cudaSuccess);
+    assert(gpuFree(kernel_1d_) == gpuSuccess);
+    assert(gpuFree(kernel_2d_) == gpuSuccess);
+    assert(gpuFree(kernel_3d_) == gpuSuccess);
   }
 
   const Eigen::GpuDevice& device() const { return gpu_device_; }
@@ -104,7 +103,7 @@ struct GPUContext {
   float* kernel_2d_;
   float* kernel_3d_;
 
-  Eigen::CudaStreamDevice stream_;
+  Eigen::GpuStreamDevice stream_;
   Eigen::GpuDevice gpu_device_;
 };
 
@@ -283,12 +282,12 @@ void test_gpu() {
   float* d_in1;
   float* d_in2;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@@ -296,7 +295,7 @@ void test_gpu() {
 
   GPUContext context(gpu_in1, gpu_in2, gpu_out);
   test_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -306,7 +305,7 @@ void test_gpu() {
   }
 
   test_forced_contextual_eval(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -316,7 +315,7 @@ void test_gpu() {
   }
 
   test_compound_assignment(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 50; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -326,7 +325,7 @@ void test_gpu() {
   }
 
   test_contraction(&context);
-  assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+  assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 40; ++j) {
       const float result = out(i,j,0);
@@ -341,8 +340,8 @@ void test_gpu() {
   }
 
   test_1d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 70; ++k) {
@@ -352,8 +351,8 @@ void test_gpu() {
   }
 
   test_2d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 40; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -365,9 +364,13 @@ void test_gpu() {
     }
   }
 
+#if !defined(EIGEN_USE_HIP)
+// disable this test on the HIP platform
+// 3D tensor convolutions seem to hang on the HIP platform
+
   test_3d_convolution(&context);
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
   for (int i = 0; i < 39; ++i) {
     for (int j = 0; j < 49; ++j) {
       for (int k = 0; k < 69; ++k) {
@@ -380,10 +383,13 @@ void test_gpu() {
       }
     }
   }
+
+#endif
+ 
 }
 
 
-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
 {
   CALL_SUBTEST_1(test_cpu());
   CALL_SUBTEST_2(test_gpu());
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 7f79753c5..5095cb078 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -13,19 +13,65 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <stdint.h>
+#include <iostream>
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_memory(const Eigen::SyclDevice &sycl_device) {
+  std::cout << "Running on : "
+            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
+            <<std::endl;
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
+  DataType* gpu_in_data  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+  sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+  VERIFY(sycl_device.ok());
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorDims = {{sizeDim1}};
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+  sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
 
-void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
-  std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
-    << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+  out.device(sycl_device) = in / in.constant(0);
+
+  sycl_device.synchronize();
+  VERIFY(!sycl_device.ok());
+  sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+  test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  /// this test throw an exception. enable it if you want to see the exception
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
 }
-void test_cxx11_tensor_device_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_device_sycl(sycl_device));
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  }
 }
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..ee416e14a 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,10 +60,29 @@ static void test_rank_zero()
   VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
 }
 
-void test_cxx11_tensor_dimension()
+static void test_index_type_promotion() {
+  Eigen::DSizes<int, 3> src0(1, 2, 3);
+  Eigen::array<int, 3> src1;
+  src1[0] = 4;
+  src1[1] = 5;
+  src1[2] = 6;
+
+  Eigen::DSizes<long, 3> dst0(src0);
+  Eigen::DSizes<long, 3> dst1(src1);
+
+  VERIFY_IS_EQUAL(dst0[0], 1L);
+  VERIFY_IS_EQUAL(dst0[1], 2L);
+  VERIFY_IS_EQUAL(dst0[2], 3L);
+  VERIFY_IS_EQUAL(dst1[0], 4L);
+  VERIFY_IS_EQUAL(dst1[1], 5L);
+  VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
 {
   CALL_SUBTEST(test_dynamic_size());
   CALL_SUBTEST(test_fixed_size());
   CALL_SUBTEST(test_match());
   CALL_SUBTEST(test_rank_zero());
+  CALL_SUBTEST(test_index_type_promotion());
 }
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
 }
 
 
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
 {
    CALL_SUBTEST(test_empty_tensor());
    CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..66b06e8ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yields the
+// same results for all the ops, supporting tiled evaluation.
+
+// Default assignment that does no use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+          typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
+                                                   Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+}
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(min_dim, max_dim);
+  }
+  return dims;
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_rvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Layout, Index> src(dims);
+  src.setRandom();
+
+#define TEST_CHIPPING(CHIP_DIM)                                           \
+  if (NumDims > (CHIP_DIM)) {                                             \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+    const auto expr = src.template chip<(CHIP_DIM)>(offset);              \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> golden;                         \
+    golden = expr;                                                        \
+                                                                          \
+    Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions());       \
+                                                                          \
+    using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;   \
+    using Executor = internal::TensorExecutor<const Assign, Device,       \
+                                              Vectorizable, Tiling>;      \
+                                                                          \
+    Executor::run(Assign(dst, expr), d);                                  \
+                                                                          \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {            \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                     \
+    }                                                                     \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_lvalue(Device d)
+{
+  auto dims = RandomDims<NumDims>(1, 10);
+
+#define TEST_CHIPPING(CHIP_DIM)                                             \
+  if (NumDims > (CHIP_DIM)) {                                               \
+    /* Generate random data that we'll assign to the chipped tensor dim. */ \
+    array<Index, NumDims - 1> src_dims;                                     \
+    for (int i = 0; i < NumDims - 1; ++i) {                                 \
+      int dim = i < (CHIP_DIM) ? i : i + 1;                                 \
+      src_dims[i] = dims[dim];                                              \
+    }                                                                       \
+                                                                            \
+    Tensor<T, NumDims - 1, Layout, Index> src(src_dims);                    \
+    src.setRandom();                                                        \
+                                                                            \
+    const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1);   \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> random(dims);                         \
+    random.setZero();                                                       \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> golden(dims);                         \
+    golden = random;                                                        \
+    golden.template chip<(CHIP_DIM)>(offset) = src;                         \
+                                                                            \
+    Tensor<T, NumDims, Layout, Index> dst(dims);                            \
+    dst = random;                                                           \
+    auto expr = dst.template chip<(CHIP_DIM)>(offset);                      \
+                                                                            \
+    using Assign = TensorAssignOp<decltype(expr), const decltype(src)>;     \
+    using Executor = internal::TensorExecutor<const Assign, Device,         \
+                                              Vectorizable, Tiling>;        \
+                                                                            \
+    Executor::run(Assign(expr, src), d);                                    \
+                                                                            \
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {              \
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));                       \
+    }                                                                       \
+  }
+
+  TEST_CHIPPING(0)
+  TEST_CHIPPING(1)
+  TEST_CHIPPING(2)
+  TEST_CHIPPING(3)
+  TEST_CHIPPING(4)
+  TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Multiple 0th dimension and then shuffle.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.reshape(reshaped_dims);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> golden =
+      src.slice(slice_start, slice_size);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  Tensor<T, NumDims, Options, Index> slice(slice_size);
+  slice.setRandom();
+
+  // Assign a slice using default executor.
+  Tensor<T, NumDims, Options, Index> golden = src;
+  golden.slice(slice_start, slice_size) = slice;
+
+  // And using configured execution strategy.
+  Tensor<T, NumDims, Options, Index> dst = src;
+  auto expr = dst.slice(slice_start, slice_size);
+
+  using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(expr, slice), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.square().eval().broadcast(broadcasts);
+
+  // We assume that broadcasting on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template<typename T, int NumDims>
+struct DummyGenerator {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  T operator()(const array <Index, NumDims>& dims) const {
+    T result = static_cast<T>(0);
+    for (int i = 0; i < NumDims; ++i) {
+      result += static_cast<T>((i + 1) * dims[i]);
+    }
+    return result;
+  }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_generator_op(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(20, 30);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  const auto expr = src.generate(DummyGenerator<T, NumDims>());
+
+  // We assume that generator on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor<T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the broadcasting using configured tensor executor.
+  Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+    TiledEvaluation Tiling, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  Tensor <T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Reverse half of the dimensions.
+  Eigen::array<bool, NumDims> reverse;
+  for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+  const auto expr = src.reverse(reverse);
+
+  // We assume that reversing on a default device is tested and correct, so
+  // we can rely on it to verify correctness of tensor executor and tiling.
+  Tensor <T, NumDims, Options, Index> golden;
+  golden = expr;
+
+  // Now do the reversing using configured tensor executor.
+  Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+    internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> src(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  src.setRandom();
+  const auto expr = src.square();
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T square = src.coeff(i) * src.coeff(i);
+    VERIFY_IS_EQUAL(square, dst.coeff(i));
+  }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  // Pick a large enough tensor size to bypass small tensor block evaluation
+  // optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  Tensor<T, NumDims, Options, Index> lhs(dims);
+  Tensor<T, NumDims, Options, Index> rhs(dims);
+  Tensor<T, NumDims, Options, Index> dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using DoneCallback = decltype(on_done);
+  using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+                                                 Vectorizable, Tiling>;
+
+  Executor::runAsync(Assign(dst, expr), d, on_done);
+  done.Wait();
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    T sum = lhs.coeff(i) + rhs.coeff(i);
+    VERIFY_IS_EQUAL(sum, dst.coeff(i));
+  }
+}
+
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                                 \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    false,               TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(default_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice,    VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(default_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));          \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device)));      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS)                                                                      \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     ColMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  ColMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false,               TiledEvaluation::On,  RowMajor>(tp_device)));     \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::Off,     RowMajor>(tp_device))); \
+  CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true),  TiledEvaluation::On,  RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+  Eigen::DefaultDevice default_device;
+  // Default device is unused in ASYNC tests.
+  EIGEN_UNUSED_VARIABLE(default_device);
+
+  const auto num_threads = internal::random<int>(20, 24);
+  Eigen::ThreadPool tp(num_threads);
+  Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+  CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+  CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+  CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+  CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+  CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+  CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
+
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
+  CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
+}
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67..169fc1898 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -7,6 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <numeric>
+
 #include "main.h"
 
 #include <Eigen/CXX11/Tensor>
@@ -193,26 +195,23 @@ static void test_constants()
 
 static void test_boolean()
 {
-  Tensor<int, 1> vec(6);
-  std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+  const int kSize = 31;
+  Tensor<int, 1> vec(kSize);
+  std::iota(vec.data(), vec.data() + kSize, 0);
 
   // Test ||.
   Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
-  VERIFY_IS_EQUAL(bool1[0], true);
-  VERIFY_IS_EQUAL(bool1[1], false);
-  VERIFY_IS_EQUAL(bool1[2], false);
-  VERIFY_IS_EQUAL(bool1[3], false);
-  VERIFY_IS_EQUAL(bool1[4], false);
-  VERIFY_IS_EQUAL(bool1[5], true);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = i < 1 || i > 4;
+    VERIFY_IS_EQUAL(bool1[i], expected);
+  }
 
   // Test &&, including cast of operand vec.
   Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
-  VERIFY_IS_EQUAL(bool2[0], false);
-  VERIFY_IS_EQUAL(bool2[1], true);
-  VERIFY_IS_EQUAL(bool2[2], true);
-  VERIFY_IS_EQUAL(bool2[3], true);
-  VERIFY_IS_EQUAL(bool2[4], false);
-  VERIFY_IS_EQUAL(bool2[5], false);
+  for (int i = 0; i < kSize; ++i) {
+    bool expected = bool(i) && i < 4;
+    VERIFY_IS_EQUAL(bool2[i], expected);
+  }
 
   // Compilation tests:
   // Test Tensor<bool> against results of cast or comparison; verifies that
@@ -300,8 +299,152 @@ static void test_select()
   }
 }
 
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+  for (int size = 1; size < 17; ++size) {
+    const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN();
+    const Scalar kInf = std::numeric_limits<Scalar>::infinity();
+    const Scalar kZero(0);
+    Tensor<Scalar, 1> vec_all_nan(size);
+    Tensor<Scalar, 1> vec_one_nan(size);
+    Tensor<Scalar, 1> vec_zero(size);
+    vec_all_nan.setConstant(kNaN);
+    vec_zero.setZero();
+    vec_one_nan.setZero();
+    vec_one_nan(size/2) = kNaN;
+
+    auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY((numext::isnan)(v(i)));
+      }
+    };
+
+    auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) {
+      for (int i = 0; i < size; ++i) {
+        VERIFY_IS_EQUAL(v(i), Scalar(0));
+      }
+    };
+
+    // Test NaN propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = nan
+    // max(0, nan) = nan
+    // max(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_all_nan));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero));
+
+    // Test number propagating max.
+    // max(nan, nan) = nan
+    // max(nan, 0) = 0
+    // max(0, nan) = 0
+    // max(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero));
+
+    // Test NaN propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = nan
+    // min(0, nan) = nan
+    // min(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_all_nan));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_zero));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN));
+    verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero));
+
+    // Test number propagating min.
+    // min(nan, nan) = nan
+    // min(nan, 0) = 0
+    // min(0, nan) = 0
+    // min(0, 0) = 0
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_zero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_all_nan));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero));
+    verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero));
+
+    // Test min and max reduction
+    Tensor<Scalar, 0> val;
+    val = vec_zero.minimum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.maximum();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNaN>();
+    VERIFY_IS_EQUAL(val(), kZero);
+    val = vec_zero.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kZero);
+
+    // Test NaN propagation for tensor of all NaNs.
+    val = vec_all_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_all_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), kInf);
+    val = vec_all_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_all_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), -kInf);
+
+    // Test NaN propagation for tensor with a single NaN.
+    val = vec_one_nan.template minimum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template minimum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero));
+    val = vec_one_nan.template maximum<PropagateNaN>();
+    VERIFY((numext::isnan)(val()));
+    val = vec_one_nan.template maximum<PropagateNumbers>();
+    VERIFY_IS_EQUAL(val(), (size == 1 ? -kInf : kZero));
+  }
+}
+
+static void test_clip()
+{
+  Tensor<float, 1> vec(6);
+  vec(0) = 4.0;
+  vec(1) = 8.0;
+  vec(2) = 15.0;
+  vec(3) = 16.0;
+  vec(4) = 23.0;
+  vec(5) = 42.0;
+
+  float kMin = 20;
+  float kMax = 30;
+
+  Tensor<float, 1> vec_clipped(6);
+  vec_clipped = vec.clip(kMin, kMax);
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}
 
-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
@@ -311,4 +454,11 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_clip());
+
+// Nan propagation does currently not work like one would expect from std::max/std::min,
+// so we disable it for now
+#if !EIGEN_ARCH_ARM_OR_ARM64
+  CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
 }
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 2f14ebc62..2e1008eca 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,7 +224,35 @@ static void test_fft_real_input_energy() {
   }
 }
 
-void test_cxx11_tensor_fft() {
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+  int n = (1 << exponent) + 1;
+
+  Eigen::DSizes<ptrdiff_t, 1> dimensions;
+  dimensions[0] = n;
+  const DSizes<ptrdiff_t, 1> arr = dimensions;
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+      input.template fft<BothParts, FFT_FORWARD>(fft);
+
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+      forward.template fft<RealPart, FFT_REVERSE>(fft);
+
+  for (int i = 0; i < n; ++i) {
+    RealScalar tol = test_precision<RealScalar>() *
+                     (std::abs(input[i]) + std::abs(output[i]) + 1);
+    VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
     test_fft_complex_input_golden();
     test_fft_real_input_golden();
 
@@ -270,4 +298,7 @@ void test_cxx11_tensor_fft() {
     test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
     test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
     test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+
+    test_fft_non_power_of_2_round_trip<float>(7);
+    test_fft_non_power_of_2_round_trip<double>(7);
 }
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 4c660de65..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
   VERIFY_IS_EQUAL(scalar1.size(), 1);
-  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+  VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);
 
   scalar1() = 7.0;
   scalar2() = 13.0;
@@ -250,7 +250,7 @@ static void test_array()
   }
 }
 
-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..a21a02bec 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -61,7 +61,7 @@ static void test_const()
   Eigen::array<int, 2> bcast;
   bcast[0] = 3;
   bcast[1] = 1;
-  const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
+  const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3);
   Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
 
   for (int i = 0; i < 3; ++i) {
@@ -72,7 +72,7 @@ static void test_const()
 }
 
 
-void test_cxx11_tensor_forced_eval()
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
 {
   CALL_SUBTEST(test_simple());
   CALL_SUBTEST(test_const());
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 5690da723..a55a5ad8a 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -13,44 +13,44 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
-
+template <typename DataType, int DataLayout, typename IndexType>
 void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 
-  int sizeDim1 = 100;
-  int sizeDim2 = 200;
-  int sizeDim3 = 200;
-  Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  Eigen::Tensor<float, 3> in1(tensorRange);
-  Eigen::Tensor<float, 3> in2(tensorRange);
-  Eigen::Tensor<float, 3> out(tensorRange);
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 20;
+  IndexType sizeDim3 = 20;
+  Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+  Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
 
-  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
 
-  in1 = in1.random() + in1.constant(10.0f);
-  in2 = in2.random() + in2.constant(10.0f);
+  in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
+  in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
 
   // creating TensorMap from tensor
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
-  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
-  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
-  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float));
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
   /// c=(a+b)*b
   gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i, j, k),
                          (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
       }
@@ -63,8 +63,15 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
 
 }
 
-void test_cxx11_tensor_forced_eval_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
+template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
+    CALL_SUBTEST(tensorForced_evalperDevice<half>(device));
+  }
 }
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
index dcb928714..6dcf676bb 100644
--- a/unsupported/test/cxx11_tensor_generator.cpp
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -42,11 +42,11 @@ struct Generator2D {
 template <int DataLayout>
 static void test_2D()
 {
-  Tensor<float, 2> matrix(5, 7);
+  Tensor<float, 2> matrix(512, 512);
   Tensor<float, 2> result = matrix.generate(Generator2D());
 
-  for (int i = 0; i < 5; ++i) {
-    for (int j = 0; j < 5; ++j) {
+  for (int i = 0; i < 512; ++i) {
+    for (int j = 0; j < 512; ++j) {
       VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
     }
   }
@@ -80,7 +80,7 @@ static void test_gaussian()
 }
 
 
-void test_cxx11_tensor_generator()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
 {
   CALL_SUBTEST(test_1D<ColMajor>());
   CALL_SUBTEST(test_1D<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 000000000..fb6e3d9d0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold =1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 6;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < 5; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType rows = 32;
+  IndexType cols = 48;
+  array<DataType, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<DataType, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  array<IndexType, 2> tensorRange = {{rows, cols}};
+  Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize =matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      Eigen::internal::isApprox(result(i, j), gaussian, error_threshold);
+    }
+  }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0ba9d52e9..137d0d596 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,18 +9,19 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH  EIGEN_HAS_CXX11
+
 using Eigen::Tensor;
 
-void test_cuda_nullary() {
+void test_gpu_nullary() {
   Tensor<float, 1, 0, int> in1(2);
   Tensor<float, 1, 0, int> in2(2);
   in1.setRandom();
@@ -30,12 +31,12 @@ void test_cuda_nullary() {
 
   float* d_in1;
   float* d_in2;
-  cudaMalloc((void**)(&d_in1), tensor_bytes);
-  cudaMalloc((void**)(&d_in2), tensor_bytes);
-  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  gpuMalloc((void**)(&d_in1), tensor_bytes);
+  gpuMalloc((void**)(&d_in2), tensor_bytes);
+  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -49,23 +50,23 @@ void test_cuda_nullary() {
   Tensor<float, 1, 0, int> new1(2);
   Tensor<float, 1, 0, int> new2(2);
 
-  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
 
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(new1(i), 3.14f);
     VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
 }
 
-void test_cuda_elementwise_small() {
+void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -79,14 +80,14 @@ void test_cuda_elementwise_small() {
   float* d_in1;
   float* d_in2;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -98,9 +99,9 @@ void test_cuda_elementwise_small() {
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(
@@ -108,12 +109,12 @@ void test_cuda_elementwise_small() {
         in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_out);
 }
 
-void test_cuda_elementwise()
+void test_gpu_elementwise()
 {
   Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
   Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -132,16 +133,16 @@ void test_cuda_elementwise()
   float* d_in2;
   float* d_in3;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_in2), in2_bytes);
-  cudaMalloc((void**)(&d_in3), in3_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_in2), in2_bytes);
+  gpuMalloc((void**)(&d_in3), in3_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -151,8 +152,8 @@ void test_cuda_elementwise()
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 53; ++j) {
@@ -162,13 +163,13 @@ void test_cuda_elementwise()
     }
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_in2);
-  cudaFree(d_in3);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_in2);
+  gpuFree(d_in3);
+  gpuFree(d_out);
 }
 
-void test_cuda_props() {
+void test_gpu_props() {
   Tensor<float, 1> in1(200);
   Tensor<bool, 1> out(200);
   in1.setRandom();
@@ -178,12 +179,12 @@ void test_cuda_props() {
 
   float* d_in1;
   bool* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -193,19 +194,19 @@ void test_cuda_props() {
 
   gpu_out.device(gpu_device) = (gpu_in1.isnan)();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 200; ++i) {
     VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_out);
 }
 
-void test_cuda_reduction()
+void test_gpu_reduction()
 {
   Tensor<float, 4> in1(72,53,97,113);
   Tensor<float, 2> out(72,97);
@@ -216,12 +217,12 @@ void test_cuda_reduction()
 
   float* d_in1;
   float* d_out;
-  cudaMalloc((void**)(&d_in1), in1_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -233,8 +234,8 @@ void test_cuda_reduction()
 
   gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -249,12 +250,12 @@ void test_cuda_reduction()
     }
   }
 
-  cudaFree(d_in1);
-  cudaFree(d_out);
+  gpuFree(d_in1);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_contraction()
+void test_gpu_contraction()
 {
   // with these dimensions, the output has 300 * 140 elements, which is
   // more than 30 * 1024, which is the number of threads in blocks on
@@ -274,14 +275,14 @@ void test_cuda_contraction()
   float* d_t_right;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_left), t_left_bytes);
-  cudaMalloc((void**)(&d_t_right), t_right_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_left), t_left_bytes);
+  gpuMalloc((void**)(&d_t_right), t_right_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -301,7 +302,7 @@ void test_cuda_contraction()
   m_result = m_left * m_right;
   gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
 
-  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
 
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
@@ -310,13 +311,13 @@ void test_cuda_contraction()
     }
   }
 
-  cudaFree(d_t_left);
-  cudaFree(d_t_right);
-  cudaFree(d_t_result);
+  gpuFree(d_t_left);
+  gpuFree(d_t_right);
+  gpuFree(d_t_result);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_1d()
+void test_gpu_convolution_1d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 1, DataLayout> kernel(4);
@@ -331,14 +332,14 @@ void test_cuda_convolution_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -348,8 +349,8 @@ void test_cuda_convolution_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(1);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 34; ++j) {
@@ -364,12 +365,12 @@ void test_cuda_convolution_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
-void test_cuda_convolution_inner_dim_col_major_1d()
+void test_gpu_convolution_inner_dim_col_major_1d()
 {
   Tensor<float, 4, ColMajor> input(74,9,11,7);
   Tensor<float, 1, ColMajor> kernel(4);
@@ -384,14 +385,14 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -401,8 +402,8 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(0);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 71; ++i) {
     for (int j = 0; j < 9; ++j) {
@@ -417,12 +418,12 @@ void test_cuda_convolution_inner_dim_col_major_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
-void test_cuda_convolution_inner_dim_row_major_1d()
+void test_gpu_convolution_inner_dim_row_major_1d()
 {
   Tensor<float, 4, RowMajor> input(7,9,11,74);
   Tensor<float, 1, RowMajor> kernel(4);
@@ -437,14 +438,14 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -454,8 +455,8 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   Eigen::array<Eigen::DenseIndex, 1> dims(3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 7; ++i) {
     for (int j = 0; j < 9; ++j) {
@@ -470,13 +471,13 @@ void test_cuda_convolution_inner_dim_row_major_1d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_2d()
+void test_gpu_convolution_2d()
 {
   Tensor<float, 4, DataLayout> input(74,37,11,137);
   Tensor<float, 2, DataLayout> kernel(3,4);
@@ -491,14 +492,14 @@ void test_cuda_convolution_2d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -508,8 +509,8 @@ void test_cuda_convolution_2d()
   Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 35; ++j) {
@@ -534,13 +535,13 @@ void test_cuda_convolution_2d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 template<int DataLayout>
-void test_cuda_convolution_3d()
+void test_gpu_convolution_3d()
 {
   Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -555,14 +556,14 @@ void test_cuda_convolution_3d()
   float* d_input;
   float* d_kernel;
   float* d_out;
-  cudaMalloc((void**)(&d_input), input_bytes);
-  cudaMalloc((void**)(&d_kernel), kernel_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_input), input_bytes);
+  gpuMalloc((void**)(&d_kernel), kernel_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;    
+  Eigen::GpuStreamDevice stream;    
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -572,8 +573,8 @@ void test_cuda_convolution_3d()
   Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 74; ++i) {
     for (int j = 0; j < 35; ++j) {
@@ -612,14 +613,15 @@ void test_cuda_convolution_3d()
     }
   }
 
-  cudaFree(d_input);
-  cudaFree(d_kernel);
-  cudaFree(d_out);
+  gpuFree(d_input);
+  gpuFree(d_kernel);
+  gpuFree(d_out);
 }
 
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
-void test_cuda_lgamma(const Scalar stddev)
+void test_gpu_lgamma(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -631,12 +633,12 @@ void test_cuda_lgamma(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -644,8 +646,8 @@ void test_cuda_lgamma(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.lgamma();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -653,12 +655,13 @@ void test_cuda_lgamma(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
+#endif
 
 template <typename Scalar>
-void test_cuda_digamma()
+void test_gpu_digamma()
 {
   Tensor<Scalar, 1> in(7);
   Tensor<Scalar, 1> out(7);
@@ -685,12 +688,12 @@ void test_cuda_digamma()
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
@@ -698,8 +701,8 @@ void test_cuda_digamma()
 
   gpu_out.device(gpu_device) = gpu_in.digamma();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 5; ++i) {
     VERIFY_IS_APPROX(out(i), expected_out(i));
@@ -708,12 +711,12 @@ void test_cuda_digamma()
     VERIFY_IS_EQUAL(out(i), expected_out(i));
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_zeta()
+void test_gpu_zeta()
 {
   Tensor<Scalar, 1> in_x(6);
   Tensor<Scalar, 1> in_q(6);
@@ -747,14 +750,14 @@ void test_cuda_zeta()
   Scalar* d_in_x;
   Scalar* d_in_q;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_q), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_q), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
   
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
@@ -763,8 +766,8 @@ void test_cuda_zeta()
 
   gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   VERIFY_IS_EQUAL(out(0), expected_out(0));
   VERIFY((std::isnan)(out(3)));
@@ -775,13 +778,13 @@ void test_cuda_zeta()
     }
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_q);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_q);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_polygamma()
+void test_gpu_polygamma()
 {
   Tensor<Scalar, 1> in_x(7);
   Tensor<Scalar, 1> in_n(7);
@@ -818,14 +821,14 @@ void test_cuda_polygamma()
   Scalar* d_in_x;
   Scalar* d_in_n;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_n), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_n), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
   
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
@@ -834,20 +837,20 @@ void test_cuda_polygamma()
 
   gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 7; ++i) {
     VERIFY_IS_APPROX(out(i), expected_out(i));
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_n);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_n);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_igamma()
+void test_gpu_igamma()
 {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
@@ -883,14 +886,14 @@ void test_cuda_igamma()
   Scalar* d_a;
   Scalar* d_x;
   Scalar* d_out;
-  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+  assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
 
-  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -899,8 +902,8 @@ void test_cuda_igamma()
 
   gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
@@ -912,13 +915,13 @@ void test_cuda_igamma()
     }
   }
 
-  cudaFree(d_a);
-  cudaFree(d_x);
-  cudaFree(d_out);
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_igammac()
+void test_gpu_igammac()
 {
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
@@ -953,14 +956,14 @@ void test_cuda_igammac()
   Scalar* d_a;
   Scalar* d_x;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_a), bytes);
-  cudaMalloc((void**)(&d_x), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -969,8 +972,8 @@ void test_cuda_igammac()
 
   gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
@@ -982,13 +985,14 @@ void test_cuda_igammac()
     }
   }
 
-  cudaFree(d_a);
-  cudaFree(d_x);
-  cudaFree(d_out);
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
 }
 
+#if EIGEN_GPU_TEST_C99_MATH
 template <typename Scalar>
-void test_cuda_erf(const Scalar stddev)
+void test_gpu_erf(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -1000,12 +1004,12 @@ void test_cuda_erf(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
-  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+  assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
+  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1013,8 +1017,8 @@ void test_cuda_erf(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.erf();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1022,12 +1026,12 @@ void test_cuda_erf(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -1039,12 +1043,12 @@ void test_cuda_erfc(const Scalar stddev)
 
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1052,8 +1056,8 @@ void test_cuda_erfc(const Scalar stddev)
 
   gpu_out.device(gpu_device) = gpu_in.erfc();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1061,12 +1065,73 @@ void test_cuda_erfc(const Scalar stddev)
     }
   }
 
-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif
+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  Tensor<Scalar, 1> in_x(8);
+  Tensor<Scalar, 1> out(8);
+  Tensor<Scalar, 1> expected_out(8);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
 }
 
 template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
 {
   Tensor<Scalar, 1> in_x(125);
   Tensor<Scalar, 1> in_a(125);
@@ -1175,16 +1240,16 @@ void test_cuda_betainc()
   Scalar* d_in_a;
   Scalar* d_in_b;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in_x), bytes);
-  cudaMalloc((void**)(&d_in_a), bytes);
-  cudaMalloc((void**)(&d_in_b), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_in_a), bytes);
+  gpuMalloc((void**)(&d_in_b), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
 
-  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
@@ -1194,8 +1259,8 @@ void test_cuda_betainc()
 
   gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   for (int i = 1; i < 125; ++i) {
     if ((std::isnan)(expected_out(i))) {
@@ -1205,83 +1270,374 @@ void test_cuda_betainc()
     }
   }
 
-  cudaFree(d_in_x);
-  cudaFree(d_in_a);
-  cudaFree(d_in_b);
-  cudaFree(d_out);
+  gpuFree(d_in_x);
+  gpuFree(d_in_a);
+  gpuFree(d_in_b);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i0e()
+{
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
+      0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+      0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+      0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+      0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+      0.0897803118848;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i1e()
+{
+  Tensor<Scalar, 1> in_x(21);
+  Tensor<Scalar, 1> out(21);
+  Tensor<Scalar, 1> expected_out(21);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_x_array(21);
+  Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+  expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
+      -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+      -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+      0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+      0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+      0.0875062221833;
+
+  for (int i = 0; i < 21; ++i) {
+    in_x(i) = in_x_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+  gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 21; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in);
+  gpuFree(d_out);
 }
 
+template <typename Scalar>
+void test_gpu_igamma_der_a()
+{
+  Tensor<Scalar, 1> in_x(30);
+  Tensor<Scalar, 1> in_a(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_a_array(30);
+  Array<Scalar, 1, Dynamic> in_x_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+      1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+      100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
+      -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
+      -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
+      -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
+      -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
+      -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
+      -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
+      -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+  for (int i = 0; i < 30; ++i) {
+    in_x(i) = in_x_array(i);
+    in_a(i) = in_a_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_a), bytes);
+  gpuMalloc((void**)(&d_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
 
-void test_cxx11_tensor_cuda()
+  gpuFree(d_a);
+  gpuFree(d_x);
+  gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_gamma_sample_der_alpha()
 {
-  CALL_SUBTEST_1(test_cuda_nullary());
-  CALL_SUBTEST_1(test_cuda_elementwise_small());
-  CALL_SUBTEST_1(test_cuda_elementwise());
-  CALL_SUBTEST_1(test_cuda_props());
-  CALL_SUBTEST_1(test_cuda_reduction());
-  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
-  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
-  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
-  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
+  Tensor<Scalar, 1> in_alpha(30);
+  Tensor<Scalar, 1> in_sample(30);
+  Tensor<Scalar, 1> out(30);
+  Tensor<Scalar, 1> expected_out(30);
+  out.setZero();
+
+  Array<Scalar, 1, Dynamic> in_alpha_array(30);
+  Array<Scalar, 1, Dynamic> in_sample_array(30);
+  Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+  // See special_functions.cpp for the Python code that generates the test data.
+
+  in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
+      1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
+      100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+  in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+  expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+      1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+      0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+      1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+      0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+      1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+      0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+      1.00106492525, 0.97734200649, 1.02198794179;
+
+  for (int i = 0; i < 30; ++i) {
+    in_alpha(i) = in_alpha_array(i);
+    in_sample(i) = in_sample_array(i);
+    expected_out(i) = expected_out_array(i);
+  }
+
+  std::size_t bytes = in_alpha.size() * sizeof(Scalar);
+
+  Scalar* d_alpha;
+  Scalar* d_sample;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_alpha), bytes);
+  gpuMalloc((void**)(&d_sample), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+  gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+                         gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (int i = 0; i < 30; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_alpha);
+  gpuFree(d_sample);
+  gpuFree(d_out);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
+{
+  CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_elementwise_small());
+  CALL_SUBTEST_1(test_gpu_elementwise());
+  CALL_SUBTEST_1(test_gpu_props());
+  CALL_SUBTEST_1(test_gpu_reduction());
+  CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+// they hang..need to investigate and fix
+  CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
+#endif
+
+#if EIGEN_GPU_TEST_C99_MATH
   // std::erf, std::erfc, and so on where only added in c++11. We use them
   // as a golden reference to validate the results produced by Eigen. Therefore
   // we can only run these tests if we use a c++11 compiler.
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
-
-  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
-  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
-  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
-  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
-
-  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
-  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
-
-  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
-  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
-  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
-  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
-  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
-
-  CALL_SUBTEST_5(test_cuda_digamma<float>());
-  CALL_SUBTEST_5(test_cuda_digamma<double>());
-
-  CALL_SUBTEST_5(test_cuda_polygamma<float>());
-  CALL_SUBTEST_5(test_cuda_polygamma<double>());
-
-  CALL_SUBTEST_5(test_cuda_zeta<float>());
-  CALL_SUBTEST_5(test_cuda_zeta<double>());
-
-  CALL_SUBTEST_5(test_cuda_igamma<float>());
-  CALL_SUBTEST_5(test_cuda_igammac<float>());
-
-  CALL_SUBTEST_5(test_cuda_igamma<double>());
-  CALL_SUBTEST_5(test_cuda_igammac<double>());
-
-  CALL_SUBTEST_6(test_cuda_betainc<float>());
-  CALL_SUBTEST_6(test_cuda_betainc<double>());
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
+  // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+
+  CALL_SUBTEST_5(test_gpu_ndtri<float>());
+  CALL_SUBTEST_5(test_gpu_ndtri<double>());
+
+  CALL_SUBTEST_5(test_gpu_digamma<float>());
+  CALL_SUBTEST_5(test_gpu_digamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_polygamma<float>());
+  CALL_SUBTEST_5(test_gpu_polygamma<double>());
+
+  CALL_SUBTEST_5(test_gpu_zeta<float>());
+  CALL_SUBTEST_5(test_gpu_zeta<double>());
+#endif
+
+  CALL_SUBTEST_5(test_gpu_igamma<float>());
+  CALL_SUBTEST_5(test_gpu_igammac<float>());
+
+  CALL_SUBTEST_5(test_gpu_igamma<double>());
+  CALL_SUBTEST_5(test_gpu_igammac<double>());
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+  CALL_SUBTEST_6(test_gpu_betainc<float>());
+  CALL_SUBTEST_6(test_gpu_betainc<double>());
+
+  CALL_SUBTEST_6(test_gpu_i0e<float>());
+  CALL_SUBTEST_6(test_gpu_i0e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_i1e<float>());
+  CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
+  CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
+
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
+  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
+#endif
+
 #endif
 }
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
index 5fd88fa6c..c20edd9ac 100644
--- a/unsupported/test/cxx11_tensor_ifft.cpp
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3)
   }
 }
 
-void test_cxx11_tensor_ifft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
   CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
new file mode 100644
index 000000000..db1c0206e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 245;
+  IndexType sizeDim2 = 343;
+  IndexType sizeDim3 = 577;
+
+  array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}};
+
+  Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range);
+  Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range);
+
+
+
+  typedef Eigen::DSizes<IndexType, 3> Index3;
+  Index3 strides1(1L,1L, 1L);
+  Index3 indicesStart1(1L, 0L, 0L);
+  Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3);
+
+  Index3 strides2(1L,1L, 1L);
+  Index3 indicesStart2(0L, 0L, 0L);
+  Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3);
+  Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3);
+
+  tensor1.setRandom();
+  tensor2.setRandom();
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType));
+  gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2);
+
+
+  for (IndexType i = 0; i <slice_range[0] ; ++i) {
+    for (IndexType j = 0; j < slice_range[1]; ++j) {
+      for (IndexType k = 0; k < slice_range[2]; ++k) {
+        VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) { 
+   CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+   CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
+#endif
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 475c59651..862f1f7f0 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -405,6 +405,57 @@ void test_patch_padding_same()
   }
 }
 
+// Verifies that SAME padding, when computed as negative values, will be clipped
+// to zero.
+void test_patch_padding_same_negative_padding_clip_to_zero() {
+  int input_depth = 1;
+  int input_rows = 15;
+  int input_cols = 1;
+  int input_batches = 1;
+  int ksize = 1;  // Corresponds to the Rows and Cols for
+                  // tensor.extract_image_patches<>.
+  int row_stride = 5;
+  int col_stride = 1;
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(
+      ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME);
+  // row padding will be computed as -2 originally and then be clipped to 0.
+  VERIFY_IS_EQUAL(result.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);    // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);          // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);          // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 3);              // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major =
+      tensor_row_major.extract_image_patches(ksize, ksize, row_stride,
+                                             col_stride, 1, 1, PADDING_SAME);
+  VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f);
+  VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f);
+
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+}
+
 void test_patch_no_extra_dim()
 {
   Tensor<float, 3> tensor(2,3,5);
@@ -746,7 +797,7 @@ void test_imagenet_patches()
   }
 }
 
-void test_cxx11_tensor_image_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
 {
   CALL_SUBTEST_1(test_simple_patch());
   CALL_SUBTEST_2(test_patch_no_extra_dim());
@@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch()
   CALL_SUBTEST_4(test_patch_padding_valid_same_value());
   CALL_SUBTEST_5(test_patch_padding_same());
   CALL_SUBTEST_6(test_imagenet_patches());
+  CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero());
 }
diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
new file mode 100644
index 000000000..c1828a0ec
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -0,0 +1,1092 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index colmajor " << i << " : "
+           << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index row major" << i << " : "
+           << tensor_row_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            for (IndexType b = 0; b < 7; ++b) {
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+              }
+              if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 3;
+  IndexType input_cols = 3;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+  // ColMajor
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_result_col_major);
+  sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies VALID padding (no padding) with the same value.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 1;
+  IndexType input_rows = 5;
+  IndexType input_cols = 5;
+  IndexType input_batches = 2;
+  IndexType ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+  // ColMajor
+
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+  gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f);
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}};
+  Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+  gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r + i - row_padding;
+              IndexType col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){
+  IndexType input_depth = 3;
+  IndexType input_rows = 4;
+  IndexType input_cols = 2;
+  IndexType input_batches = 1;
+  IndexType ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  IndexType stride = 2;  // Only same stride is supported.
+
+  // ColMajor
+  array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+  array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+  // Initializes tensor with incrementing numbers.
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    tensor_col_major.data()[i] = i + 1;
+  }
+
+array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}};
+Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+DataType* gpu_data_result_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+
+  array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }};
+  Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_result_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+  gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+
+  for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      IndexType patchId = i+input_rows*j;
+      for (IndexType r = 0; r < ksize; ++r) {  // patch rows
+        for (IndexType c = 0; c < ksize; ++c) {  // patch cols
+          for (IndexType d = 0; d < input_depth; ++d) {  // depth
+            for (IndexType b = 0; b < input_batches; ++b) {  // batch
+              DataType expected_col_major = 0.0f;
+              DataType expected_row_major = 0.0f;
+              IndexType row_offset = r*stride + i - row_padding;
+              IndexType col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+template <typename DataType, typename IndexType>
+static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+
+  // ColMajor
+  array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}};
+  Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  tensor_col_major.setRandom();
+  Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0));
+
+
+  // Single pixel patch: ColMajor
+  array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+  gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3);
+
+  // Single pixel patch: RowMajor
+  array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+  sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    // ColMajor
+    if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor_col_major.data()[i] << " vs "
+           << single_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+                    single_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_image_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+  sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+DataType* gpu_data_entire_image_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 3; ++r) {
+        for (IndexType c = 0; c < 5; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+              expected_col_major = tensor_col_major(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}};
+  Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+  patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+  gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+  Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+  gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+  sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  IndexType row_padding = 0;
+  IndexType col_padding = 0;
+  IndexType stride = 1;
+
+  for (IndexType i = 0; i < 3; ++i) {
+    for (IndexType j = 0; j < 5; ++j) {
+      IndexType patchId = i+3*j;
+      for (IndexType r = 0; r < 2; ++r) {
+        for (IndexType c = 0; c < 2; ++c) {
+          for (IndexType d = 0; d < 2; ++d) {
+            DataType expected_col_major = 0.0f;
+            DataType expected_row_major = 0.0f;
+            IndexType row_offset = r*stride + i - row_padding;
+            IndexType col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+              expected_col_major = tensor_col_major(d, row_offset, col_offset);
+            }
+            if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_patch_row_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+  sycl_device.deallocate(gpu_data_twod_patch_col_major);
+  sycl_device.deallocate(gpu_data_twod_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  IndexType sizeDim1 = 3;
+  IndexType sizeDim2 = 128;
+  IndexType sizeDim3 = 128;
+  IndexType sizeDim4 = 16;
+  array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange);
+  l_in_col_major.setRandom();
+
+  DataType* gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+
+  array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange);
+  size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange);
+  gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4);
+
+  // RowMajor
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}};
+  Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1);
+
+  for (IndexType b = 0; b < 16; ++b) {
+    for (IndexType i = 0; i < 128; ++i) {
+      for (IndexType j = 0; j < 128; ++j) {
+        IndexType patchId = i+128*j;
+        for (IndexType c = 0; c < 11; ++c) {
+          for (IndexType r = 0; r < 11; ++r) {
+            for (IndexType d = 0; d < 3; ++d) {
+              DataType expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+                expected = l_in_col_major(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 16;
+  sizeDim2 = 64;
+  sizeDim3 = 64;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+// RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 64; ++i) {
+      for (IndexType j = 0; j < 64; ++j) {
+        IndexType patchId = i+64*j;
+        for (IndexType c = 0; c < 9; ++c) {
+          for (IndexType r = 0; r < 9; ++r) {
+            for (IndexType d = 0; d < 16; ++d) {
+              DataType expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+                expected = l_in_col_major(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 32;
+  sizeDim2 = 16;
+  sizeDim3 = 16;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 16; ++i) {
+      for (IndexType j = 0; j < 16; ++j) {
+        IndexType patchId = i+16*j;
+        for (IndexType c = 0; c < 7; ++c) {
+          for (IndexType r = 0; r < 7; ++r) {
+            for (IndexType d = 0; d < 32; ++d) {
+              DataType expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+                expected = l_in_col_major(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sizeDim1 = 64;
+  sizeDim2 = 13;
+  sizeDim3 = 13;
+  sizeDim4 = 32;
+  tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  l_in_col_major.resize(tensorColMajorRange);
+  l_in_col_major.setRandom();
+  gpu_data_l_in_col_major  = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange);
+
+  patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}};
+  l_out_col_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+  gpu_data_l_out_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+  gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+  // RowMajor
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+  patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}};
+  l_out_row_major.resize(patchTensorRange);
+  patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+  gpu_data_l_out_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange);
+  gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3);
+  sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (IndexType b = 0; b < 32; ++b) {
+    for (IndexType i = 0; i < 13; ++i) {
+      for (IndexType j = 0; j < 13; ++j) {
+        IndexType patchId = i+13*j;
+        for (IndexType c = 0; c < 3; ++c) {
+          for (IndexType r = 0; r < 3; ++r) {
+            for (IndexType d = 0; d < 64; ++d) {
+              DataType expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+                expected = l_in_col_major(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out_col_major(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_l_in_col_major);
+  sycl_device.deallocate(gpu_data_l_out_col_major);
+  sycl_device.deallocate(gpu_data_l_out_row_major);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+test_simple_image_patch_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device);
+test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
+test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
+test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
+}
+}
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index 4cf5df666..2166532c8 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -22,9 +22,9 @@ static void test_static_index_list()
   VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
 
   EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -167,19 +167,18 @@ static void test_type2indexpair_list()
 
   typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;
   typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;
-  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;
-  typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b;
+  typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c;
 
-  Dims0 d0;
   Dims2_a d2_a;
 
   Dims2_b d2_b;
-  d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));
+  d2_b.set(1, Eigen::IndexPair<Index>(1,11));
 
   Dims2_c d2_c;
-  d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
-  d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // setting type2indexpair to correct value.
-  d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+  d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<Index>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<Index>(2,12));
 
   VERIFY_IS_EQUAL(d2_a[0].first, 0);
   VERIFY_IS_EQUAL(d2_a[0].second, 10);
@@ -278,9 +277,9 @@ static void test_dynamic_index_list()
   VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0);
 
   Tensor<float, 1> result = tensor.sum(reduction_axis);
   for (int i = 0; i < result.size(); ++i) {
@@ -310,10 +309,10 @@ static void test_mixed_index_list()
   VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
   VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
   VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
-  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3);
 
   typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
   ReductionIndices reduction_indices;
@@ -373,7 +372,7 @@ static void test_dim_check()
 
 #endif
 
-void test_cxx11_tensor_index_list()
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
 {
 #ifdef EIGEN_HAS_INDEX_LIST
   CALL_SUBTEST(test_static_index_list());
diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
index 4997935e9..75089e856 100644
--- a/unsupported/test/cxx11_tensor_inflation.cpp
+++ b/unsupported/test/cxx11_tensor_inflation.cpp
@@ -74,7 +74,7 @@ static void test_simple_inflation()
   }
 }
 
-void test_cxx11_tensor_inflation()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
 {
   CALL_SUBTEST(test_simple_inflation<ColMajor>());
   CALL_SUBTEST(test_simple_inflation<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
new file mode 100644
index 000000000..521ae0cc3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation Definition for each dimension the inflated val would be
+//((dim-1)*strid[dim] +1)
+
+// for 1 dimension vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
+// tensor of size (2*3) +1 = 7 with the value of
+// (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated  = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+          if (i % strides[0] == 0 &&
+              j % strides[1] == 0 &&
+              k % strides[2] == 0 &&
+              l % strides[3] == 0) {
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_stride);
+  sycl_device.deallocate(gpu_data_inflated);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 8e2b70b75..d18a05ec4 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -135,7 +135,7 @@ void test_specific() {
   VERIFY_IS_EQUAL(result, result_op);
 }
 
-void test_cxx11_tensor_intdiv()
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
 {
   CALL_SUBTEST_1(test_signed_32bit());
   CALL_SUBTEST_2(test_unsigned_32bit());
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 489960529..2c638f9bf 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -119,7 +119,7 @@ static void test_output_const()
 }
 
 
-void test_cxx11_tensor_io()
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
 {
   CALL_SUBTEST(test_output_0d<ColMajor>());
   CALL_SUBTEST(test_output_0d<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
index ae297a9da..efb333360 100644
--- a/unsupported/test/cxx11_tensor_layout_swap.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -54,7 +54,7 @@ static void test_swap_as_lvalue()
 }
 
 
-void test_cxx11_tensor_layout_swap()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
 {
   CALL_SUBTEST(test_simple_swap());
   CALL_SUBTEST(test_swap_as_lvalue());
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
new file mode 100644
index 000000000..9546b911c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  //tensor.setRandom();
+
+//  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, typename IndexType>
+static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 7;
+  array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+  Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+  Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+  TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+  gpu2.swap_layout().device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+
+//  Tensor<float, 3, ColMajor> tensor(2,3,7);
+//  tensor.setRandom();
+
+  //Tensor<float, 3, RowMajor> tensor2(7,3,2);
+//  tensor2.swap_layout() = tensor;
+  VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_swap_sycl<DataType, int64_t>(sycl_device);
+  test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
index 071f5b406..6ba9a212d 100644
--- a/unsupported/test/cxx11_tensor_lvalue.cpp
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -36,7 +36,7 @@ static void test_compound_assignment()
 }
 
 
-void test_cxx11_tensor_lvalue()
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
 {
   CALL_SUBTEST(test_compound_assignment());
 }
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 3db0ee7c0..4d4f68911 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -19,8 +19,8 @@ static void test_0d()
   Tensor<int, 0> scalar1;
   Tensor<int, 0, RowMajor> scalar2;
 
-  TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());
-  TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
+  TensorMap<const Tensor<int, 0> > scalar3(scalar1.data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
 
   scalar1() = 7;
   scalar2() = 13;
@@ -37,8 +37,8 @@ static void test_1d()
   Tensor<int, 1> vec1(6);
   Tensor<int, 1, RowMajor> vec2(6);
 
-  TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);
-  TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
+  TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6);
+  TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6);
 
   vec1(0) = 4;  vec2(0) = 0;
   vec1(1) = 8;  vec2(1) = 1;
@@ -85,8 +85,8 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;
 
-  TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);
-  TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+  TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3);
+  TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
 
   VERIFY_IS_EQUAL(mat3.rank(), 2);
   VERIFY_IS_EQUAL(mat3.size(), 6);
@@ -129,8 +129,8 @@ static void test_3d()
     }
   }
 
-  TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);
-  TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
 
   VERIFY_IS_EQUAL(mat3.rank(), 3);
   VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
@@ -265,7 +265,54 @@ static void test_casting()
   VERIFY_IS_EQUAL(sum1, 861);
 }
 
-void test_cxx11_tensor_map()
+template<typename T>
+static const T& add_const(T& value) {
+  return value;
+}
+
+static void test_0d_const_tensor()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data());
+  TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_0d_const_tensor_map()
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  const TensorMap<Tensor<int, 0> > scalar3(scalar1.data());
+  const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
+
+  // Although TensorMap is constant, we still can write to the underlying
+  // storage, because we map over non-constant Tensor.
+  scalar3() = 7;
+  scalar4() = 13;
+
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+
+  // Pointer to the underlying storage is also non-const.
+  scalar3.data()[0] = 8;
+  scalar4.data()[0] = 14;
+
+  VERIFY_IS_EQUAL(scalar1(), 8);
+  VERIFY_IS_EQUAL(scalar2(), 14);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
@@ -274,4 +321,7 @@ void test_cxx11_tensor_map()
 
   CALL_SUBTEST(test_from_tensor());
   CALL_SUBTEST(test_casting());
+
+  CALL_SUBTEST(test_0d_const_tensor());
+  CALL_SUBTEST(test_0d_const_tensor_map());
 }
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index 61c742a16..82a1a26d8 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -39,7 +39,7 @@ static void test_sigmoid()
 }
 
 
-void test_cxx11_tensor_math()
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
 {
   CALL_SUBTEST(test_tanh());
   CALL_SUBTEST(test_sigmoid());
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
new file mode 100644
index 000000000..029653e27
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.tanh();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.tanh();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+  IndexType sizeDim1 = 4;
+  IndexType sizeDim2 = 4;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+  in = in.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+  gpu2.device(sycl_device) = gpu1.sigmoid();
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+  out_cpu=in.sigmoid();
+
+  for (int i = 0; i < in.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), out_cpu(i));
+  }
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 4fba6fdd1..ee2616fd7 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -47,7 +47,7 @@ static void test_simple()
 }
 
 
-void test_cxx11_tensor_mixed_indices()
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
 {
   CALL_SUBTEST(test_simple());
 }
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index f7de43110..ed5d5ade3 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -41,7 +41,29 @@ static void test_simple_reshape()
   }
 }
 
-template<typename>
+template <typename>
+static void test_static_reshape() {
+#if defined(EIGEN_HAS_INDEX_LIST)
+  using Eigen::type2index;
+
+  Tensor<float, 5> tensor(2, 3, 1, 7, 1);
+  tensor.setRandom();
+
+  // New dimensions: [2, 3, 7]
+  Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
+  Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k));
+      }
+    }
+  }
+#endif
+}
+
+template <typename>
 static void test_reshape_in_expr() {
   MatrixXf m1(2,3*5*7*11);
   MatrixXf m2(3*5*7*11,13);
@@ -90,19 +112,19 @@ static void test_reshape_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_simple_slice()
 {
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
   tensor.setRandom();
 
-  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+  Tensor<T, 5, DataLayout> slice1(1,1,1,1,1);
   Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
   slice1 = tensor.slice(indices, sizes);
   VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
 
-  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+  Tensor<T, 5, DataLayout> slice2(1,1,2,2,3);
   Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
   Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
   slice2 = tensor.slice(indices2, sizes2);
@@ -115,20 +137,20 @@ static void test_simple_slice()
   }
 }
 
-template<typename=void>
+template<typename T>
 static void test_const_slice()
 {
-  const float b[1] = {42};
-  TensorMap<Tensor<const float, 1> > m(b, 1);
+  const T b[1] = {42};
+  TensorMap<Tensor<const T, 1> > m(b, 1);
   DSizes<DenseIndex, 1> offsets;
   offsets[0] = 0;
-  TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+  TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions()));
   VERIFY_IS_EQUAL(slice_ref(0), 42);
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_in_expr() {
-  typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+  typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx;
   Mtx m1(7,7);
   Mtx m2(3,3);
   m1.setRandom();
@@ -136,10 +158,10 @@ static void test_slice_in_expr() {
 
   Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
 
-  TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
-  TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
-  Tensor<float, 2, DataLayout> tensor3(3,1);
-  typedef Tensor<float, 1>::DimensionPair DimPair;
+  TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<T, 2, DataLayout> tensor3(3,1);
+  typedef typename Tensor<T, 1>::DimensionPair DimPair;
   array<DimPair, 1> contract_along{{DimPair(1, 0)}};
 
   Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
@@ -156,28 +178,28 @@ static void test_slice_in_expr() {
   }
 
   // Take an arbitrary slice of an arbitrarily sized tensor.
-  TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
-  Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
   for (int i = 0; i < 35; ++i) {
     VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_as_lvalue()
 {
-  Tensor<float, 3, DataLayout> tensor1(2,2,7);
+  Tensor<T, 3, DataLayout> tensor1(2,2,7);
   tensor1.setRandom();
-  Tensor<float, 3, DataLayout> tensor2(2,2,7);
+  Tensor<T, 3, DataLayout> tensor2(2,2,7);
   tensor2.setRandom();
-  Tensor<float, 3, DataLayout> tensor3(4,3,5);
+  Tensor<T, 3, DataLayout> tensor3(4,3,5);
   tensor3.setRandom();
-  Tensor<float, 3, DataLayout> tensor4(4,3,2);
+  Tensor<T, 3, DataLayout> tensor4(4,3,2);
   tensor4.setRandom();
-  Tensor<float, 3, DataLayout> tensor5(10,13,12);
+  Tensor<T, 3, DataLayout> tensor5(10,13,12);
   tensor5.setRandom();
 
-  Tensor<float, 3, DataLayout> result(4,5,7);
+  Tensor<T, 3, DataLayout> result(4,5,7);
   Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
   Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
   result.slice(first_slice, sizes12) = tensor1;
@@ -223,10 +245,10 @@ static void test_slice_as_lvalue()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_slice_raw_data()
 {
-  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+  Tensor<T, 4, DataLayout> tensor(3,5,7,11);
   tensor.setRandom();
 
   Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
@@ -253,7 +275,7 @@ static void test_slice_raw_data()
   extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
   auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
   VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
-  VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0));
 
   if (DataLayout == ColMajor) {
     offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
@@ -318,15 +340,15 @@ static void test_slice_raw_data()
 }
 
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice()
 {
-  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Tensor<T, 5, DataLayout> Tensor5f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
-  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
-  Tensor<float, 2, DataLayout> tensor2(7,11);
+  Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<T, 2, DataLayout> tensor2(7,11);
   tensor.setRandom();
   tensor2.setRandom();
 
@@ -412,13 +434,13 @@ static void test_strided_slice()
   }
 }
 
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_strided_slice_write()
 {
-  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Tensor<T, 2, DataLayout> Tensor2f;
   typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
 
-  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11);
   tensor.setRandom();
   tensor2=tensor;
   Tensor2f slice(2,3);
@@ -438,15 +460,14 @@ static void test_strided_slice_write()
   }
 }
 
-
-template<int DataLayout>
+template<typename T, int DataLayout>
 static void test_composition()
 {
-  Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+  Eigen::Tensor<T, 2, DataLayout> matrix(7, 11);
   matrix.setRandom();
 
   const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
-  Eigen::Tensor<float, 3, DataLayout> tensor =
+  Eigen::Tensor<T, 3, DataLayout> tensor =
       matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
 
   VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
@@ -458,28 +479,87 @@ static void test_composition()
   }
 }
 
+template<typename T, int DataLayout>
+static void test_empty_slice()
+{
+  Tensor<T, 3, DataLayout> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<T, 3, DataLayout> copy = tensor;
+
+  // empty size in first dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2);
+  Tensor<T, 3, DataLayout> slice1(0,1,2);
+  slice1.setRandom();
+  tensor.slice(indices1, sizes1) = slice1;
+
+  // empty size in second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2);
+  Tensor<T, 3, DataLayout> slice2(1,0,2);
+  slice2.setRandom();
+  tensor.slice(indices2, sizes2) = slice2;
+
+  // empty size in third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0);
+  Tensor<T, 3, DataLayout> slice3(1,1,0);
+  slice3.setRandom();
+  tensor.slice(indices3, sizes3) = slice3;
+
+  // empty size in first and second dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2);
+  Tensor<T, 3, DataLayout> slice4(0,0,2);
+  slice4.setRandom();
+  tensor.slice(indices4, sizes4) = slice4;
+
+  // empty size in second and third dimension
+  Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0);
+  Tensor<T, 3, DataLayout> slice5(1,0,0);
+  slice5.setRandom();
+  tensor.slice(indices5, sizes5) = slice5;
+
+  // empty size in all dimensions
+  Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3);
+  Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0);
+  Tensor<T, 3, DataLayout> slice6(0,0,0);
+  slice6.setRandom();
+  tensor.slice(indices6, sizes6) = slice6;
+
+  // none of these operations should change the tensor's components
+  // because all of the rvalue slices have at least one zero dimension
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+          VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k));
+      }
+    }
+  }
+}
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME)       \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>()))
 
-void test_cxx11_tensor_morphing()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
 {
   CALL_SUBTEST_1(test_simple_reshape<void>());
-  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_static_reshape<void>());
   CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
-
-  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
-  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
-  CALL_SUBTEST_1(test_const_slice());
-  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
-  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
-  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
-  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
-
-  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
-  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
-  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
-
-  CALL_SUBTEST_7(test_composition<ColMajor>());
-  CALL_SUBTEST_7(test_composition<RowMajor>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_const_slice<float>());
+
+  CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue);
+  CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data);
+  CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write);
+  CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition);
 }
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
new file mode 100644
index 000000000..bf001b40f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
+  typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
+  typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
+
+  Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
+
+  tensor1.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+  DataType* gpu_data4  = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
+  TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
+  TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+
+  gpu2.device(sycl_device)=gpu1.reshape(dim2);
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType));
+
+  gpu3.device(sycl_device)=gpu1.reshape(dim3);
+  sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+  gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
+  sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      ///ColMajor
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));    ///ColMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));    ///ColMajor
+        }
+        else{
+          //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));      /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k));   /// RowMajor
+          VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+  sycl_device.deallocate(gpu_data4);
+}
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+  typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
+  typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
+  typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
+  Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
+  Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
+  Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
+
+  tensor.setRandom();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
+
+  TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
+  TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
+  TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+
+  gpu2.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType));
+
+  gpu3.reshape(dim1).device(sycl_device)=gpu1;
+  sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
+
+
+  for (IndexType i = 0; i < 2; ++i){
+    for (IndexType j = 0; j < 3; ++j){
+      for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+        if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+          VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));    ///ColMajor
+        }
+        else{
+          VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k));   /// RowMajor
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  IndexType sizeDim5 = 11;
+  typedef Eigen::DSizes<IndexType, 5> Index5;
+  Index5 strides(1L,1L,1L,1L,1L);
+  Index5 indicesStart(1L,2L,3L,4L,5L);
+  Index5 indicesStop(2L,3L,4L,5L,6L);
+  Index5 lengths(1L,1L,1L,1L,1L);
+
+  array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+  Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+  DataType* gpu_data_stride2  = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range);
+
+  Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+  sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+
+  gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
+  sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType));
+
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+  Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+  Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range);
+
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+  DataType* gpu_data_stride3  = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+  TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range);
+  Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+  Index5 strides2(1L,1L,1L,1L,1L);
+  Index5 indicesStart2(1L,1L,3L,4L,5L);
+  Index5 indicesStop2(2L,2L,5L,6L,8L);
+
+  gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+  sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+
+  gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
+  sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 2; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
+  typedef Eigen::DSizes<IndexType, 2> Index2;
+  IndexType sizeDim1 = 7L;
+  IndexType sizeDim2 = 11L;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
+  IndexType sliceDim1 = 2;
+  IndexType sliceDim2 = 3;
+  array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
+  Tensor2f slice(sliceRange);
+  Index2 strides(1L,1L);
+  Index2 indicesStart(3L,4L);
+  Index2 indicesStop(5L,7L);
+  Index2 lengths(2L,3L);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange);
+
+
+  tensor.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1;
+
+  slice.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
+
+
+  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
+  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+  for(IndexType i=0;i<sizeDim1;i++)
+    for(IndexType j=0;j<sizeDim2;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
+
+template <typename OutIndex, typename DSizes>
+Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::array<OutIndex, DSizes::count> out;
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];
+  }
+  return out;
+}
+
+template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
+int run_eigen(const SyclDevice& sycl_device) {
+  using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>;
+  using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>;
+  using TensorMI64 = TensorMap<TensorI64>;
+  using TensorMI32 = TensorMap<TensorI32>;
+  Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}};
+  Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}};
+
+  TensorI64 out_tensor_gpu(tensor_range);
+  TensorI64 out_tensor_cpu(tensor_range);
+  out_tensor_cpu.setRandom();
+
+  TensorI64 sub_tensor(slice_range);
+  sub_tensor.setRandom();
+
+  DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
+  DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
+  TensorMI64 out_gpu(out_gpu_data, tensor_range);
+  TensorMI64 sub_gpu(sub_gpu_data, slice_range);
+
+  sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
+
+  Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}};
+  Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}};
+  TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
+  TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
+  TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
+  TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
+
+  out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;
+
+  out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
+
+  sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
+  int has_err = 0;
+  for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
+    auto exp = out_tensor_cpu(i);
+    auto val = out_tensor_gpu(i);
+    if (val != exp) {
+      std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
+      has_err = 1;
+    }
+  }
+  sycl_device.deallocate(out_gpu_data);
+  sycl_device.deallocate(sub_gpu_data);
+  return has_err;
+}
+
+template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
+  test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  run_eigen<float, RowMajor, long, int>(sycl_device); 
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp
new file mode 100644
index 000000000..a2982319f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_move.cpp
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <utility>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void calc_indices(int i, int& x, int& y, int& z)
+{
+  x = i / 4;
+  y = (i % 4) / 2;
+  z = i % 2;
+}
+
+static void test_move()
+{
+  int x;
+  int y;
+  int z;
+
+  Tensor<int,3> tensor1(2, 2, 2);
+  Tensor<int,3,RowMajor> tensor2(2, 2, 2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    tensor1(x,y,z) = i;
+    tensor2(x,y,z) = 2 * i;
+  }
+
+  // Invokes the move constructor.
+  Tensor<int,3> moved_tensor1 = std::move(tensor1);
+  Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2);
+
+  VERIFY_IS_EQUAL(tensor1.size(), 0);
+  VERIFY_IS_EQUAL(tensor2.size(), 0);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i);
+  }
+
+  Tensor<int,3> moved_tensor3(2,2,2);
+  Tensor<int,3,RowMajor> moved_tensor4(2,2,2);
+
+  moved_tensor3.setZero();
+  moved_tensor4.setZero();
+
+  // Invokes the move assignment operator.
+  moved_tensor3 = std::move(moved_tensor1);
+  moved_tensor4 = std::move(moved_tensor2);
+
+  for (int i = 0; i < 8; i++)
+  {
+    calc_indices(i, x, y, z);
+    VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i);
+    VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_move)
+{
+  CALL_SUBTEST(test_move());
+}
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index c946007b8..8e8165302 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,38 +9,21 @@
 
 #define EIGEN_USE_THREADS
 
+#include <atomic>
+
 #include <stdlib.h>
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#include <windows.h>
-void sleep(int seconds) {
-  Sleep(seconds*1000);
-}
-#else
-#include <unistd.h>
-#endif
-
-
-namespace {
-
-void WaitAndAdd(Eigen::Notification* n, int* counter) {
-  n->Wait();
-  *counter = *counter + 1;
-}
-
-}  // namespace
-
 static void test_notification_single()
 {
   ThreadPool thread_pool(1);
 
-  int counter = 0;
+  std::atomic<int> counter(0);
   Eigen::Notification n;
-  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
   thread_pool.Schedule(func);
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
 
   // The thread should be waiting for the notification.
   VERIFY_IS_EQUAL(counter, 0);
@@ -48,7 +31,7 @@ static void test_notification_single()
   // Unblock the thread
   n.Notify();
 
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
 
   // Verify the counter has been incremented
   VERIFY_IS_EQUAL(counter, 1);
@@ -60,21 +43,21 @@ static void test_notification_multiple()
 {
   ThreadPool thread_pool(1);
 
-  int counter = 0;
+  std::atomic<int> counter(0);
   Eigen::Notification n;
-  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter](){ n.Wait(); ++counter;};
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
   thread_pool.Schedule(func);
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   VERIFY_IS_EQUAL(counter, 0);
   n.Notify();
-  sleep(1);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   VERIFY_IS_EQUAL(counter, 4);
 }
 
-void test_cxx11_tensor_notification()
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
 {
   CALL_SUBTEST(test_notification_single());
   CALL_SUBTEST(test_notification_multiple());
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index e9d1b2d3c..99e18076a 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -94,7 +94,7 @@ static void test_contractions()
 }
 
 
-void test_cxx11_tensor_of_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
 {
   CALL_SUBTEST(test_additions());
   CALL_SUBTEST(test_abs());
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
index f179a0c21..344d678ef 100644
--- a/unsupported/test/cxx11_tensor_of_const_values.cpp
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -97,7 +97,7 @@ static void test_plus_equal()
 }
 
 
-void test_cxx11_tensor_of_const_values()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
 {
   CALL_SUBTEST(test_assign());
   CALL_SUBTEST(test_plus());
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 2f86980a2..30bcc1d28 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -9,21 +9,19 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+
 using Eigen::Tensor;
 
 template<typename>
-void test_cuda_numext() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_numext() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -59,14 +57,14 @@ void test_cuda_numext() {
 }
 
 
-#ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_GPU_FP16
 
 template<typename>
-void test_cuda_conversion() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
-  
+
   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
@@ -97,8 +95,8 @@ void test_cuda_conversion() {
 }
 
 template<typename>
-void test_cuda_unary() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_unary() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -134,8 +132,8 @@ void test_cuda_unary() {
 }
 
 template<typename>
-void test_cuda_elementwise() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_elementwise() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -176,8 +174,8 @@ void test_cuda_elementwise() {
 }
 
 template<typename>
-void test_cuda_trancendental() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_trancendental() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -200,6 +198,8 @@ void test_cuda_trancendental() {
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
 
   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
   gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
@@ -207,6 +207,7 @@ void test_cuda_trancendental() {
   gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
   gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
   gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
 
   gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
   gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
@@ -217,6 +218,9 @@ void test_cuda_trancendental() {
   gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
   gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
 
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
   Tensor<float, 1> input1(num_elem);
   Tensor<Eigen::half, 1> half_prec1(num_elem);
   Tensor<Eigen::half, 1> full_prec1(num_elem);
@@ -243,7 +247,7 @@ void test_cuda_trancendental() {
   }
   for (int i = 0; i < num_elem; ++i) {
     std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
       VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
     else
       VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
@@ -264,8 +268,8 @@ void test_cuda_trancendental() {
 }
 
 template<typename>
-void test_cuda_contractions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_contractions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int rows = 23;
   int cols = 23;
@@ -315,36 +319,32 @@ void test_cuda_contractions() {
 }
 
 template<typename>
-void test_cuda_reductions(int size1, int size2, int redux) {
+void test_gpu_reductions(int size1, int size2, int redux) {
 
    std::cout << "Reducing " << size1 << " by " << size2
-             << " tensor along dim " << redux << std::endl; 
+             << " tensor along dim " << redux << std::endl;
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = size1*size2;
   int result_size = (redux == 1 ? size1 : size2);
 
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
 
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size1, size2);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
       d_res_half, result_size);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, result_size);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
-  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+  gpu_float.device(gpu_device) = gpu_float.random() * 2.0f;
 
-  Eigen::array<int, 1> redux_dim = {{redux}};
-  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+  Eigen::array<int, 1> redux_dim = {redux};
+  gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
 
   Tensor<Eigen::half, 1> half_prec(result_size);
   Tensor<Eigen::half, 1> full_prec(result_size);
@@ -357,50 +357,45 @@ void test_cuda_reductions(int size1, int size2, int redux) {
     VERIFY_IS_APPROX(full_prec(i), half_prec(i));
   }
 
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
 
 template<typename>
-void test_cuda_reductions() {
-  test_cuda_reductions<void>(13, 13, 0);
-  test_cuda_reductions<void>(13, 13, 1);
+void test_gpu_reductions() {
+  test_gpu_reductions<void>(13, 13, 0);
+  test_gpu_reductions<void>(13, 13, 1);
 
-  test_cuda_reductions<void>(35, 36, 0);
-  test_cuda_reductions<void>(35, 36, 1);
+  test_gpu_reductions<void>(35, 36, 0);
+  test_gpu_reductions<void>(35, 36, 1);
 
-  test_cuda_reductions<void>(36, 35, 0);
-  test_cuda_reductions<void>(36, 35, 1);
+  test_gpu_reductions<void>(36, 35, 0);
+  test_gpu_reductions<void>(36, 35, 1);
 }
 
 template<typename>
-void test_cuda_full_reductions() {
-  Eigen::CudaStreamDevice stream;
+void test_gpu_full_reductions() {
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int size = 13;
   int num_elem = size*size;
 
-  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
   Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
 
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
-      d_float1, size, size);
-  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
-      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
       d_res_half);
   Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
       d_res_float);
 
-  gpu_float1.device(gpu_device) = gpu_float1.random();
-  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_float.device(gpu_device) = gpu_float.random();
 
-  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+  gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum();
 
   Tensor<Eigen::half, 0> half_prec;
   Tensor<Eigen::half, 0> full_prec;
@@ -410,24 +405,23 @@ void test_cuda_full_reductions() {
 
   VERIFY_IS_APPROX(full_prec(), half_prec());
 
-  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
-  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+  gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum();
   gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
   gpu_device.synchronize();
 
   VERIFY_IS_APPROX(full_prec(), half_prec());
 
-  gpu_device.deallocate(d_float1);
-  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float);
   gpu_device.deallocate(d_res_half);
   gpu_device.deallocate(d_res_float);
 }
 
 template<typename>
-void test_cuda_forced_evals() {
+void test_gpu_forced_evals() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
 
@@ -440,7 +434,7 @@ void test_cuda_forced_evals() {
       d_float, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
       d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
       d_res_half2, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
       d_res_float, num_elem);
@@ -457,7 +451,7 @@ void test_cuda_forced_evals() {
   Tensor<float, 1> half_prec2(num_elem);
   Tensor<float, 1> full_prec(num_elem);
   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
-  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
   gpu_device.synchronize();
 
@@ -475,20 +469,20 @@ void test_cuda_forced_evals() {
 #endif
 
 
-void test_cxx11_tensor_of_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_numext<void>());
-
-#ifdef EIGEN_HAS_CUDA_FP16
-  CALL_SUBTEST_1(test_cuda_conversion<void>());
-  CALL_SUBTEST_1(test_cuda_unary<void>());
-  CALL_SUBTEST_1(test_cuda_elementwise<void>());
-  CALL_SUBTEST_1(test_cuda_trancendental<void>());
-  CALL_SUBTEST_2(test_cuda_contractions<void>());
-  CALL_SUBTEST_3(test_cuda_reductions<void>());
-  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
-  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+  CALL_SUBTEST_1(test_gpu_numext<void>());
+
+#ifdef EIGEN_HAS_GPU_FP16
+  CALL_SUBTEST_1(test_gpu_conversion<void>());
+  CALL_SUBTEST_1(test_gpu_unary<void>());
+  CALL_SUBTEST_1(test_gpu_elementwise<void>());
+  CALL_SUBTEST_1(test_gpu_trancendental<void>());
+  CALL_SUBTEST_2(test_gpu_contractions<void>());
+  CALL_SUBTEST_3(test_gpu_reductions<void>());
+  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
 #else
-  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
 #endif
 }
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 4ef9aed91..159656276 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -141,7 +141,7 @@ static void test_initialization()
 }
 
 
-void test_cxx11_tensor_of_strings()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
 {
   // Beware: none of this is likely to ever work on a GPU.
   CALL_SUBTEST(test_assign());
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index ffa19896e..b8a329deb 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -84,7 +84,7 @@ static void test_padded_expr()
   }
 }
 
-void test_cxx11_tensor_padding()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
 {
   CALL_SUBTEST(test_simple_padding<ColMajor>());
   CALL_SUBTEST(test_simple_padding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
new file mode 100644
index 000000000..727a9ffd7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  IndexType padedSizeDim1 = 2;
+  IndexType padedSizeDim2 = 6;
+  IndexType padedSizeDim3 = 12;
+  IndexType padedSizeDim4 = 7;
+  array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings);
+  sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
+  for (IndexType i = 0; i < padedSizeDim1; ++i) {
+    for (IndexType j = 0; j < padedSizeDim2; ++j) {
+      for (IndexType k = 0; k < padedSizeDim3; ++k) {
+        for (IndexType l = 0; l < padedSizeDim4; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<IndexType, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+
+  Tensor<DataType, 2, DataLayout, IndexType>  result(reshape_dims);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 6; ++j) {
+      for (IndexType k = 0; k < 12; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
index 434359730..498ab8ca7 100644
--- a/unsupported/test/cxx11_tensor_patch.cpp
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -164,7 +164,7 @@ static void test_simple_patch()
   }
 }
 
-void test_cxx11_tensor_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
 {
    CALL_SUBTEST(test_simple_patch<ColMajor>());
    CALL_SUBTEST(test_simple_patch<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
new file mode 100644
index 000000000..7f92bec78
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> patchTensorRange;
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}};
+  }else{
+     patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}};
+  }
+
+  Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+  Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange);
+
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> patch_dims;
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+
+  const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+  size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType);
+  DataType* gpu_data_tensor  = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}};
+  }else{
+     patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange);
+  patchTensorBuffSize =single_patch.size()*sizeof(DataType);
+  DataType* gpu_data_single_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange);
+
+  gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,2,1,2*2*4*7}};
+  }else{
+     patchTensorRange = {{2*2*4*7, 1, 2,2,1}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange);
+  patchTensorBuffSize =twod_patch.size()*sizeof(DataType);
+  DataType* gpu_data_twod_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange);
+
+  gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+
+  if (DataLayout == ColMajor) {
+   patchTensorRange = {{1,2,3,5,2*2*3*3}};
+  }else{
+     patchTensorRange = {{2*2*3*3, 1, 2,3,5}};
+  }
+  Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange);
+  patchTensorBuffSize =threed_patch.size()*sizeof(DataType);
+  DataType* gpu_data_threed_patch  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange);
+
+  gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+  sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_tensor);
+  sycl_device.deallocate(gpu_data_no_patch);
+  sycl_device.deallocate(gpu_data_single_patch);
+  sycl_device.deallocate(gpu_data_twod_patch);
+  sycl_device.deallocate(gpu_data_threed_patch);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 0f3dc5787..b9d4c5584 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -11,9 +11,10 @@
 
 #include <Eigen/CXX11/Tensor>
 
+template<typename Scalar>
 static void test_default()
 {
-  Tensor<float, 1> vec(6);
+  Tensor<Scalar, 1> vec(6);
   vec.setRandom();
 
   // Fixme: we should check that the generated numbers follow a uniform
@@ -23,10 +24,11 @@ static void test_default()
   }
 }
 
+template<typename Scalar>
 static void test_normal()
 {
-  Tensor<float, 1> vec(6);
-  vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+  Tensor<Scalar, 1> vec(6);
+  vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
 
   // Fixme: we should check that the generated numbers follow a gaussian
   // distribution instead.
@@ -70,9 +72,15 @@ static void test_custom()
   }
 }
 
-void test_cxx11_tensor_random()
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
 {
-  CALL_SUBTEST(test_default());
-  CALL_SUBTEST(test_normal());
+  CALL_SUBTEST((test_default<float>()));
+  CALL_SUBTEST((test_normal<float>()));
+  CALL_SUBTEST((test_default<double>()));
+  CALL_SUBTEST((test_normal<double>()));
+  CALL_SUBTEST((test_default<Eigen::half>()));
+  CALL_SUBTEST((test_normal<Eigen::half>()));
+  CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+  CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
   CALL_SUBTEST(test_custom());
 }
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
index b3be199e1..090986ebc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -9,18 +9,16 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
 
-void test_cuda_random_uniform()
+void test_gpu_random_uniform()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -28,24 +26,24 @@ void test_cuda_random_uniform()
   std::size_t out_bytes = out.size() * sizeof(float);
 
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
 
   gpu_out.device(gpu_device) = gpu_out.random();
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  // For now we just check thes code doesn't crash.
+  // For now we just check this code doesn't crash.
   // TODO: come up with a valid test of randomness
 }
 
 
-void test_cuda_random_normal()
+void test_gpu_random_normal()
 {
   Tensor<float, 2> out(72,97);
   out.setZero();
@@ -53,9 +51,9 @@ void test_cuda_random_normal()
   std::size_t out_bytes = out.size() * sizeof(float);
 
   float* d_out;
-  cudaMalloc((void**)(&d_out), out_bytes);
+  gpuMalloc((void**)(&d_out), out_bytes);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -63,8 +61,8 @@ void test_cuda_random_normal()
   Eigen::internal::NormalRandomGenerator<float> gen(true);
   gpu_out.device(gpu_device) = gpu_out.random(gen);
 
-  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 }
 
 static void test_complex()
@@ -80,9 +78,9 @@ static void test_complex()
 }
 
 
-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
 {
-  CALL_SUBTEST(test_cuda_random_uniform());
-  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_gpu_random_uniform());
+  CALL_SUBTEST(test_gpu_random_normal());
   CALL_SUBTEST(test_complex());
 }
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 000000000..6c83894a3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+  out.setZero();
+
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+  gpu_out.device(sycl_device)=gpu_out.random();
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+  Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+  out.setZero();
+  std::size_t out_bytes = out.size() * sizeof(DataType);
+
+  IndexType sizeDim0 = 72;
+  IndexType sizeDim1 = 97;
+
+  array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+  DataType* d_out  = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+  Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+  gpu_out.device(sycl_device)=gpu_out.random(gen);
+  sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+  for(IndexType i=1; i<sizeDim0; i++)
+    for(IndexType j=1; j<sizeDim1; j++)
+    {
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+      VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+    }
+
+  // For now we just check thes code doesn't crash.
+  // TODO: come up with a valid test of randomness
+  sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..c46c4c91d 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,22 @@ static void test_trivial_reductions() {
   }
 }
 
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
 static void test_simple_reductions() {
-  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
   tensor.setRandom();
+  // Add a little offset so that the product reductions won't be close to zero.
+  tensor += tensor.constant(Scalar(0.5f));
   array<ptrdiff_t, 2> reduction_axis2;
   reduction_axis2[0] = 1;
   reduction_axis2[1] = 3;
 
-  Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+  Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2);
   VERIFY_IS_EQUAL(result.dimension(0), 2);
   VERIFY_IS_EQUAL(result.dimension(1), 5);
   for (int i = 0; i < 2; ++i) {
     for (int j = 0; j < 5; ++j) {
-      float sum = 0.0f;
+      Scalar sum = Scalar(0.0f);
       for (int k = 0; k < 3; ++k) {
         for (int l = 0; l < 7; ++l) {
           sum += tensor(i, k, j, l);
@@ -77,7 +79,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+    Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum();
     VERIFY_IS_EQUAL(sum1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -85,7 +87,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
     VERIFY_IS_EQUAL(sum2.rank(), 0);
 
     VERIFY_IS_APPROX(sum1(), sum2());
@@ -98,7 +100,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float prod = 1.0f;
+      Scalar prod = Scalar(1.0f);
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 5; ++l) {
           prod *= tensor(k, i, l, j);
@@ -109,7 +111,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+    Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod();
     VERIFY_IS_EQUAL(prod1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -117,7 +119,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
     VERIFY_IS_EQUAL(prod2.rank(), 0);
 
     VERIFY_IS_APPROX(prod1(), prod2());
@@ -130,7 +132,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 3; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float max_val = std::numeric_limits<float>::lowest();
+      Scalar max_val = std::numeric_limits<Scalar>::lowest();
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 5; ++l) {
           max_val = (std::max)(max_val, tensor(k, i, l, j));
@@ -141,7 +143,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+    Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum();
     VERIFY_IS_EQUAL(max1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -149,7 +151,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
     VERIFY_IS_EQUAL(max2.rank(), 0);
 
     VERIFY_IS_APPROX(max1(), max2());
@@ -162,7 +164,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 5; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float min_val = (std::numeric_limits<float>::max)();
+      Scalar min_val = (std::numeric_limits<Scalar>::max)();
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 3; ++l) {
           min_val = (std::min)(min_val, tensor(k, l, i, j));
@@ -173,7 +175,7 @@ static void test_simple_reductions() {
   }
 
   {
-    Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+    Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum();
     VERIFY_IS_EQUAL(min1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -181,7 +183,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
     VERIFY_IS_EQUAL(min2.rank(), 0);
 
     VERIFY_IS_APPROX(min1(), min2());
@@ -194,7 +196,7 @@ static void test_simple_reductions() {
   VERIFY_IS_EQUAL(result.dimension(1), 7);
   for (int i = 0; i < 5; ++i) {
     for (int j = 0; j < 7; ++j) {
-      float sum = 0.0f;
+      Scalar sum = Scalar(0.0f);
       int count = 0;
       for (int k = 0; k < 2; ++k) {
         for (int l = 0; l < 3; ++l) {
@@ -202,12 +204,12 @@ static void test_simple_reductions() {
           ++count;
         }
       }
-      VERIFY_IS_APPROX(result(i, j), sum / count);
+      VERIFY_IS_APPROX(result(i, j), sum / Scalar(count));
     }
   }
 
   {
-    Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+    Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean();
     VERIFY_IS_EQUAL(mean1.rank(), 0);
 
     array<ptrdiff_t, 4> reduction_axis4;
@@ -215,7 +217,7 @@ static void test_simple_reductions() {
     reduction_axis4[1] = 1;
     reduction_axis4[2] = 2;
     reduction_axis4[3] = 3;
-    Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
     VERIFY_IS_EQUAL(mean2.rank(), 0);
 
     VERIFY_IS_APPROX(mean1(), mean2());
@@ -225,11 +227,11 @@ static void test_simple_reductions() {
     Tensor<int, 1> ints(10);
     std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
 
-    TensorFixedSize<bool, Sizes<> > all;
-    all = ints.all();
-    VERIFY(!all());
-    all = (ints >= ints.constant(0)).all();
-    VERIFY(all());
+    TensorFixedSize<bool, Sizes<> > all_;
+    all_ = ints.all();
+    VERIFY(!all_());
+    all_ = (ints >= ints.constant(0)).all();
+    VERIFY(all_());
 
     TensorFixedSize<bool, Sizes<> > any;
     any = (ints > ints.constant(10)).any();
@@ -368,7 +370,7 @@ static void test_static_dims() {
   Tensor<float, 2, DataLayout> out(72, 97);
   in.setRandom();
 
-#if !EIGEN_HAS_CONSTEXPR 
+#if !EIGEN_HAS_CONSTEXPR
   array<int, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
@@ -386,7 +388,7 @@ static void test_static_dims() {
           expected = (std::max)(expected, in(i, k, j, l));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -417,7 +419,7 @@ static void test_innermost_last_dims() {
           expected = (std::max)(expected, in(l, k, i, j));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -448,7 +450,7 @@ static void test_innermost_first_dims() {
           expected = (std::max)(expected, in(i, j, k, l));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
     }
   }
 }
@@ -479,16 +481,37 @@ static void test_reduce_middle_dims() {
           expected = (std::max)(expected, in(i, k, l, j));
         }
       }
-      VERIFY_IS_APPROX(out(i, j), expected);
+      VERIFY_IS_EQUAL(out(i, j), expected);
+    }
+  }
+}
+
+static void test_sum_accuracy() {
+  Tensor<float, 3> tensor(101, 101, 101);
+  for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
+    tensor.setRandom();
+    tensor += tensor.constant(prescribed_mean);
+
+    Tensor<float, 0> sum = tensor.sum();
+    double expected_sum = 0.0;
+    for (int i = 0; i < 101; ++i) {
+      for (int j = 0; j < 101; ++j) {
+        for (int k = 0; k < 101; ++k) {
+          expected_sum += static_cast<double>(tensor(i, j, k));
+        }
+      }
     }
+    VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
   }
 }
 
-void test_cxx11_tensor_reduction() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
   CALL_SUBTEST(test_trivial_reductions<ColMajor>());
   CALL_SUBTEST(test_trivial_reductions<RowMajor>());
-  CALL_SUBTEST(test_simple_reductions<ColMajor>());
-  CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() ));
+  CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() ));
   CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
   CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
   CALL_SUBTEST(test_full_reductions<ColMajor>());
@@ -505,4 +528,5 @@ void test_cxx11_tensor_reduction() {
   CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
   CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+  CALL_SUBTEST(test_sum_accuracy());
 }
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
index 6858b43a7..122ac946b 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -9,12 +9,9 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -22,7 +19,7 @@
 template<typename Type, int DataLayout>
 static void test_full_reductions() {
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   const int num_rows = internal::random<int>(1024, 5*1024);
@@ -70,7 +67,7 @@ static void test_first_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
 
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   
   // Create data(T)
@@ -110,7 +107,7 @@ static void test_last_dim_reductions() {
   Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
 
   // Create device
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice dev(&stream);
   
   // Create data
@@ -137,7 +134,7 @@ static void test_last_dim_reductions() {
 }
 
 
-void test_cxx11_tensor_reduction_cuda() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
   CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
   CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
   CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index a9ef82907..a297716e4 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -13,38 +13,168 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
 
 #include "main.h"
+
 #include <unsupported/Eigen/CXX11/Tensor>
 
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 753;
+  const IndexType num_cols = 537;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
 
+  array<IndexType, 2> outRange = {{1, 1}};
 
-static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
+  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
 
-  const int num_rows = 452;
-  const int num_cols = 765;
-  array<int, 2> tensorRange = {{num_rows, num_cols}};
+  in.setRandom();
+  auto dim = DSizes<IndexType, 2>(1, 1);
+  full_redux = in.sum().reshape(dim);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
+                                                                outRange);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+  sycl_device.memcpyDeviceToHost(
+      full_redux_gpu.data(), gpu_out_data,
+      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
+            << ", CPU FULL: " << full_redux(0, 0) << "\n";
+  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
 
-  Tensor<float, 2> in(tensorRange);
-  Tensor<float, 0> full_redux;
-  Tensor<float, 0> full_redux_gpu;
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
 
   in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.sum();
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
 
-  full_redux = in.sum();
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float));
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 
-  TensorMap<Tensor<float, 2> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 0> >  out_gpu(gpu_out_data);
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.maximum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the max.
+  // As we don't include this in the reduction the result should not be 2.
+  in(0) = static_cast<DataType>(2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.maximum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
-  out_gpu.device(sycl_device) = in_gpu.sum();
-  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
   // Check that the CPU and GPU reductions return the same result.
   VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
 
@@ -52,87 +182,833 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
   sycl_device.deallocate(gpu_out_data);
 }
 
-static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 4096;
+  const IndexType num_cols = 4096;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+  array<IndexType, 1> argRange = {{num_cols}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  //  red_axis[1]=1;
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
+  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+  in_arg1.setRandom();
+  in_arg2.setRandom();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg1.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
+      in_arg2.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+  bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
+      out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
+      gpu_in_arg1_data, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
+      gpu_in_arg2_data, tensorRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
+      gpu_out_arg_data, argRange);
+  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
+      gpu_out_arg__gpu_helper_data, argRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  // CPU VERSION
+  out_arg_cpu =
+      (in_arg1.argmax(1) == in_arg2.argmax(1))
+          .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
+  full_redux = (out_arg_cpu.template cast<float>())
+                   .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+
+  // GPU VERSION
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg1_data, in_arg1.data(),
+      (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_arg2_data, in_arg2.data(),
+      (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
+  out_Argout_gpu_helper.device(sycl_device) =
+      (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
+  out_Argout_gpu.device(sycl_device) =
+      (out_Argout_gpu_helper)
+          .select(out_Argout_gpu.constant(true),
+                  out_Argout_gpu.constant(false));
+  out_gpu.device(sycl_device) =
+      (out_Argout_gpu.template cast<float>())
+          .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
+            << '\n';
+  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_in_arg1_data);
+  sycl_device.deallocate(gpu_in_arg2_data);
+  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
+  sycl_device.deallocate(gpu_out_arg_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.mean();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_odd_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  // This is a particular case which illustrates a possible problem when the
+  // number of local threads in a workgroup is even, but is not a power of two.
+  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
+  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
+  const IndexType n_elems = 8707;
+  array<IndexType, 1> tensor_range = {{n_elems}};
+
+  data_tensor in(tensor_range);
+  DataType full_redux;
+  DataType full_redux_gpu;
+  TensorMap<scalar_tensor> red_cpu(&full_redux);
+  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
+
+  const DataType const_val = static_cast<DataType>(0.6391);
+  in = in.constant(const_val);
+
+  Eigen::IndexList<Eigen::type2index<0>> red_axis;
+  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  VERIFY_IS_APPROX(const_val, red_cpu());
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) =
+      in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 876;
+  const IndexType num_cols = 953;
+  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+  in.setRandom();
+
+  full_redux = in.minimum();
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+
+  data_tensor in(tensor_range);
+  scalar_tensor full_redux;
+  scalar_tensor full_redux_gpu;
+
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set the initial value to be the min.
+  // As we don't include this in the reduction the result should not be -2.
+  in(0) = static_cast<DataType>(-2);
+
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  full_redux = in_offset.minimum();
+  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data =
+      static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.minimum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+                                 sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.maximum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> reduced_range = {{num_cols}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = num_cols;
 
-  int dim_x = 145;
-  int dim_y = 1;
-  int dim_z = 67;
+  data_tensor in(tensor_range);
+  reduced_tensor redux;
+  reduced_tensor redux_gpu(reduced_range);
 
-  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
-  Eigen::array<int, 1> red_axis;
+  in.setRandom();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
   red_axis[0] = 0;
-  array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
 
-  Tensor<float, 3> in(tensorRange);
-  Tensor<float, 2> redux(reduced_tensorRange);
-  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+  const IndexType offset = 64;
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  redux = in_offset.maximum(red_axis);
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(redux(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate(n_reduced * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
+                                 n_reduced * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), redux(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_max_with_offset_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+  const IndexType num_rows = 64;
+  const IndexType num_cols = 64;
+  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+  array<IndexType, 1> full_reduced_range = {{num_rows}};
+  array<IndexType, 1> reduced_range = {{num_rows - 1}};
+  const IndexType n_elems = internal::array_prod(tensor_range);
+  const IndexType n_reduced = reduced_range[0];
+
+  data_tensor in(tensor_range);
+  reduced_tensor redux(full_reduced_range);
+  reduced_tensor redux_gpu(reduced_range);
 
   in.setRandom();
+  redux.setZero();
+  array<IndexType, 2> tensor_offset_range(tensor_range);
+  tensor_offset_range[0] -= 1;
+  // Set maximum value outside of the considered range.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    in(i) = static_cast<DataType>(2);
+  }
+
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 1;
+
+  const IndexType offset = 64;
+  // Introduce an offset in both the input and the output.
+  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
+  red_offset = in_offset.maximum(red_axis);
+
+  // Check that the first value hasn't been changed and that the reduced values
+  // are not equal to the previously set maximum in the input outside the range.
+  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
+  }
+
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(
+      sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
+
+  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+                                 n_elems * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
+                                 n_reduced * sizeof(DataType));
 
-  redux= in.sum(red_axis);
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < n_reduced; i++) {
+    VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
+  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
 
-  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  in.setRandom();
+  redux = in.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
 
   // Check that the CPU and GPU reductions return the same result.
-  for(int j=0; j<reduced_tensorRange[0]; j++ )
-    for(int k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType i = 0; i < redux.size(); i++) {
+    VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
+  }
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 145;
+  IndexType dim_y = 1;
+  IndexType dim_z = 67;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 0;
+  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
 
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
 }
 
-static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
+  red_axis[0] = 2;
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+  in.setRandom();
+
+  redux = in.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
 
-  int dim_x = 567;
-  int dim_y = 1;
-  int dim_z = 47;
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
-  Eigen::array<int, 1> red_axis;
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  IndexType dim_x = 64;
+  IndexType dim_y = 1;
+  IndexType dim_z = 32;
+
+  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<IndexType, 1> red_axis;
   red_axis[0] = 2;
-  array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
+  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
 
-  Tensor<float, 3> in(tensorRange);
-  Tensor<float, 2> redux(reduced_tensorRange);
-  Tensor<float, 2> redux_gpu(reduced_tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
 
   in.setRandom();
 
-  redux= in.sum(red_axis);
+  redux = in.sum(red_axis);
 
-  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
-  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
 
-  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+                                                               tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+      gpu_out_data, reduced_tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
-  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu.data(), gpu_out_data,
+      redux_gpu.dimensions().TotalSize() * sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
-  for(int j=0; j<reduced_tensorRange[0]; j++ )
-    for(int k=0; k<reduced_tensorRange[1]; k++ )
-      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+    for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+      VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
 
   sycl_device.deallocate(gpu_in_data);
   sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  // auto red_axis =  Sizes<0,1>(0,1);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+
+  redux_fix = in_fix.sum(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
 
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  auto tensorRange = Sizes<64, 32>(64, 32);
+  Eigen::IndexList<Eigen::type2index<1>> red_axis;
+  auto reduced_tensorRange = Sizes<64>(64);
+  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+  in_fix.setRandom();
+  redux_fix = in_fix.mean(red_axis);
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in_fix.data(),
+      (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+  sycl_device.memcpyDeviceToHost(
+      redux_gpu_fix.data(), gpu_out_data,
+      redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+  sycl_device.synchronize();
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+    VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data This is an example on how to get if a
+// Tensor contains nan and/or inf in one reduction
+template <typename InT, typename OutT>
+struct CustomReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
+  static constexpr OutT InfBit = 1;
+  static constexpr OutT NanBit = 2;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+                                                    OutT* accum) const {
+    if (Eigen::numext::isinf(x))
+      *accum |= InfBit;
+    else if (Eigen::numext::isnan(x))
+      *accum |= NanBit;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+                                                    OutT* accum) const {
+    *accum |= x;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+    return OutT(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+    return accum;
+  }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+          typename IndexType>
+static void test_full_reductions_custom_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  constexpr IndexType InSize = 64;
+  auto tensorRange = Sizes<InSize>(InSize);
+  Eigen::IndexList<Eigen::type2index<0>> dims;
+  auto reduced_tensorRange = Sizes<>();
+  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+  CustomReducer<DataType, AccumType> reducer;
+
+  in_fix.setRandom();
+
+  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+  DataType* gpu_in_data =
+      static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+  AccumType* gpu_out_data =
+      static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+      gpu_in_data, tensorRange);
+  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+      gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+                                 sizeof(AccumType));
+  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
 }
 
-void test_cxx11_tensor_reduction_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
-  CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
-  CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+      sycl_device);
+  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  4197, 4097);
+  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+                                                                  129, 8);
+  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
+  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+      sycl_device);
+  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  sycl_device.synchronize();
+}
 
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
+  }
 }
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index c8f105e3d..7dbd0478c 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -235,7 +235,7 @@ static void test_nested_ops_with_ref()
 }
 
 
-void test_cxx11_tensor_ref()
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
 {
   CALL_SUBTEST(test_simple_lvalue_ref());
   CALL_SUBTEST(test_simple_rvalue_ref());
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index b35b8d29e..5e44ec007 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue)
 }
 
 
-void test_cxx11_tensor_reverse()
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
 {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
new file mode 100644
index 000000000..dd30c235d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+                                                                 tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(i, 2 - j, 4 - k, l));
+        }
+      }
+    }
+  }
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(1 - i, j, k, 6 - l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+                              bool LValue) {
+  IndexType dim1 = 2;
+  IndexType dim2 = 3;
+  IndexType dim3 = 5;
+  IndexType dim4 = 7;
+
+  array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
+  tensor.setRandom();
+
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+      expected.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_result = static_cast<DataType*>(
+      sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+      gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+      gpu_out_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
+
+  if (LValue) {
+    out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+  } else {
+    out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+  }
+  sycl_device.memcpyDeviceToHost(
+      expected.data(), gpu_out_data_expected,
+      expected.dimensions().TotalSize() * sizeof(DataType));
+
+  array<IndexType, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<IndexType, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<IndexType, 4> dst_slice_dim = src_slice_dim;
+  array<IndexType, 4> dst_slice_start = src_slice_start;
+
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  sycl_device.memcpyHostToDevice(
+      gpu_out_data_result, result.data(),
+      (result.dimensions().TotalSize()) * sizeof(DataType));
+  for (IndexType i = 0; i < 5; ++i) {
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
+    dst_slice_start[2] += 1;
+  }
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
+
+  for (IndexType i = 0; i < expected.dimension(0); ++i) {
+    for (IndexType j = 0; j < expected.dimension(1); ++j) {
+      for (IndexType k = 0; k < expected.dimension(2); ++k) {
+        for (IndexType l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
+  test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
+  test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.get_info<cl::sycl::info::device::name>() << std::endl;
+    CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+    CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+    CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
index 2c26151ab..83b592384 100644
--- a/unsupported/test/cxx11_tensor_roundings.cpp
+++ b/unsupported/test/cxx11_tensor_roundings.cpp
@@ -54,7 +54,7 @@ static void test_float_ceiling()
   }
 }
 
-void test_cxx11_tensor_roundings()
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
 {
    CALL_SUBTEST(test_float_rounding());
    CALL_SUBTEST(test_float_ceiling());
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
index af59aa3ef..dccee9e84 100644
--- a/unsupported/test/cxx11_tensor_scan.cpp
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -98,7 +98,7 @@ static void test_tensor_maps() {
   }
 }
 
-void test_cxx11_tensor_scan() {
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
   CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
   CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
   CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
index 5f146f3c9..770a144f1 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -9,21 +9,20 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
 template<int DataLayout>
-void test_cuda_cumsum(int m_size, int k_size, int n_size)
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
 {
   std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
   Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@@ -38,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   float* d_t_input;
   float* d_t_result;
 
-  cudaMalloc((void**)(&d_t_input), t_input_bytes);
-  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+  gpuMalloc((void**)(&d_t_input), t_input_bytes);
+  gpuMalloc((void**)(&d_t_result), t_result_bytes);
 
-  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@@ -54,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
   gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
   t_result = t_input.cumsum(1);
 
-  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
   for (DenseIndex i = 0; i < t_result.size(); i++) {
     if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
       continue;
@@ -67,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
     assert(false);
   }
 
-  cudaFree((void*)d_t_input);
-  cudaFree((void*)d_t_result);
+  gpuFree((void*)d_t_input);
+  gpuFree((void*)d_t_result);
 }
 
 
-void test_cxx11_tensor_scan_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
 {
-  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
-  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
 }
diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp
new file mode 100644
index 000000000..09c45fce5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
+                      IndexType k_size, IndexType n_size, int consume_dim,
+                      bool exclusive) {
+  static const DataType error_threshold = 1e-4f;
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
+            << " consume_dim : " << consume_dim << ")" << std::endl;
+  Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
+  Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
+                                                          n_size);
+
+  t_input.setRandom();
+  std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
+  std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+  DataType* gpu_data_in =
+      static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
+  DataType* gpu_data_out =
+      static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+  array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
+      gpu_data_in, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
+      gpu_data_out, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
+  sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
+
+  gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
+
+  t_result = t_input.cumsum(consume_dim, exclusive);
+
+  sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
+                                 t_result_bytes);
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < t_result.size(); i++) {
+    if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+            t_result(i) - t_result_gpu(i)))) < error_threshold) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+                                  error_threshold)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
+              << " vs SYCL : " << t_result_gpu(i) << std::endl;
+    assert(false);
+  }
+  sycl_device.deallocate(gpu_data_in);
+  sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+                                                false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
+  test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+  test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+                                                false);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.template get_info<cl::sycl::info::device::name>()
+              << std::endl;
+    QueueInterface queueInterface(device);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+    CALL_SUBTEST_1(
+        sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_2(
+        sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_3(
+        sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
+    CALL_SUBTEST_4(
+        sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
+    CALL_SUBTEST_5(
+        sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
+    CALL_SUBTEST_6(
+        sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index d11444a14..89a64c021 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -81,12 +81,12 @@ static void test_expr_shuffling()
   Tensor<float, 4, DataLayout> expected;
   expected = tensor.shuffle(shuffles);
 
-  Tensor<float, 4, DataLayout> result(5,7,3,2);
+  Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
 
-  array<int, 4> src_slice_dim{{2,3,1,7}};
-  array<int, 4> src_slice_start{{0,0,0,0}};
-  array<int, 4> dst_slice_dim{{1,7,3,2}};
-  array<int, 4> dst_slice_start{{0,0,0,0}};
+  array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
+  array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
+  array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
+  array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
 
   for (int i = 0; i < 5; ++i) {
     result.slice(dst_slice_start, dst_slice_dim) =
@@ -215,7 +215,60 @@ static void test_shuffle_unshuffle()
 }
 
 
-void test_cxx11_tensor_shuffling()
+template <int DataLayout>
+static void test_empty_shuffling()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,0,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 0);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 0; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
 {
   CALL_SUBTEST(test_simple_shuffling<ColMajor>());
   CALL_SUBTEST(test_simple_shuffling<RowMajor>());
@@ -225,4 +278,6 @@ void test_cxx11_tensor_shuffling()
   CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
   CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
   CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+  CALL_SUBTEST(test_empty_shuffling<ColMajor>());
+  CALL_SUBTEST(test_empty_shuffling<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
new file mode 100644
index 000000000..ca4e8b5ef
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
+  tensor.setRandom();
+
+  const size_t buffSize = tensor.size() * sizeof(DataType);
+  array<IndexType, 4> shuffles;
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
+                                                             tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
+                                                             tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
+
+  gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
+        }
+      }
+    }
+  }
+
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  array<IndexType, 4> tensorrangeShuffle = {
+      {sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+  Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
+  DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
+      gpu_data3, tensorrangeShuffle);
+
+  gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
+  sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
+  sycl_device.synchronize();
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
+        for (IndexType l = 0; l < sizeDim4; ++l) {
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
+        }
+      }
+    }
+  }
+}
+
+template <typename DataType, typename dev_Selector>
+void sycl_shuffling_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 5a0d339ef..6d70f5435 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -316,7 +316,7 @@ static void test_resize()
   VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
 }
 
-void test_cxx11_tensor_simple()
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
index 935b908cc..aefdfa9b4 100644
--- a/unsupported/test/cxx11_tensor_striding.cpp
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -110,7 +110,7 @@ static void test_striding_as_lvalue()
 }
 
 
-void test_cxx11_tensor_striding()
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
 {
   CALL_SUBTEST(test_simple_striding<ColMajor>());
   CALL_SUBTEST(test_simple_striding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 000000000..d3b1fa77c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  //no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+//Tensor<float, 4, DataLayout> stride;
+//  stride = tensor.stride(strides);
+
+  gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+  for (IndexType i = 0; i < 1; ++i) {
+    for (IndexType j = 0; j < 1; ++j) {
+      for (IndexType k = 0; k < 3; ++k) {
+        for (IndexType l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
+  Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
+
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
+  Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
+
+
+  std::size_t tensor_bytes = tensor.size()  * sizeof(DataType);
+  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+  std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+  DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+  DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+  DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+  //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<IndexType, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+//  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+//  result.stride(strides) = tensor;
+  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+  gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
+  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+
+  array<IndexType, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+//  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+//  result2.stride(strides) = tensor.stride(no_strides);
+
+  gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(d_tensor);
+  sycl_device.deallocate(d_no_stride);
+  sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+  QueueInterface queueInterface(s);
+  auto sycl_device=Eigen::SyclDevice(&queueInterface);
+  test_simple_striding<float, ColMajor, int64_t>(sycl_device);
+  test_simple_striding<float, RowMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
+  test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorStridingPerDevice(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index 2f56eb495..2ca5c47db 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() {
   }
 }
 
-void test_cxx11_tensor_sugar()
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
 {
   CALL_SUBTEST(test_comparison_sugar());
   CALL_SUBTEST(test_scalar_sugar_add_mul());
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 6a9c33422..e6c5e2378 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -15,8 +15,8 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
 
 #include "main.h"
@@ -27,36 +27,188 @@ using Eigen::SyclDevice;
 using Eigen::Tensor;
 using Eigen::TensorMap;
 
-void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 5;
+  IndexType sizeDim3 = 1;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
+
+  in1 = in1.random();
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
+  gpu1.device(sycl_device) = gpu1 * 3.14f;
+  gpu2.device(sycl_device) = gpu2 * 2.7f;
+  sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+  //  std::cout << "SYCL DATA : " << out1(i) << "  vs  CPU DATA : " << in1(i) * 3.14f << "\n";
+    VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
+    VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
+  }
+
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
+  IndexType size = 20;
+  array<IndexType, 1> tensorRange = {{size}};
+  Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
+
+  in1 = in1.random();
+  in2 = in1;
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  in1.setZero();
+
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < in1.size(); ++i) {
+    VERIFY_IS_APPROX(out(i), in2(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type in1(tensorRange);
+  tensor_type out(tensorRange);
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  in1 = in1.random();
+  // Copy all data to device, then permute on copy back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  // Permute copies to device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  in1 = in1.random();
+  out.setZero();
+  DataType* gpu_data_out  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
+  // Copy all to device, permute copies on device, then copy all back to host
+  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
+  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < half_size; ++i) {
+    VERIFY_IS_APPROX(out(i), in1(i + half_size));
+    VERIFY_IS_APPROX(out(i + half_size), in1(i));
+  }
+
+  sycl_device.deallocate(gpu_data_out);
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
+  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+  IndexType full_size = 32;
+  IndexType half_size = full_size / 2;
+  array<IndexType, 1> tensorRange = {{full_size}};
+  tensor_type cpu_out(tensorRange);
+  tensor_type out(tensorRange);
+
+  cpu_out.setZero();
+
+  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
+  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
+
+  DataType* gpu_data  = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
+  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+  for (IndexType i = 0; i < full_size; ++i) {
+    VERIFY_IS_APPROX(out(i), cpu_out(i));
+  }
+
+  sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
 
-  int sizeDim1 = 100;
-  int sizeDim2 = 100;
-  int sizeDim3 = 100;
-  array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-  Tensor<float, 3> in1(tensorRange);
-  Tensor<float, 3> in2(tensorRange);
-  Tensor<float, 3> in3(tensorRange);
-  Tensor<float, 3> out(tensorRange);
+  IndexType sizeDim1 = 100;
+  IndexType sizeDim2 = 10;
+  IndexType sizeDim3 = 20;
+  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
+  Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
 
   in2 = in2.random();
   in3 = in3.random();
 
-  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_in3_data  = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
-  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  DataType * gpu_in1_data  = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+  DataType * gpu_in2_data  = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+  DataType * gpu_in3_data  = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
+  DataType * gpu_out_data =  static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
 
-  TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
-  TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
 
   /// a=1.2f
   gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
-  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
       }
     }
@@ -65,10 +217,12 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// a=b*1.2f
   gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) * 1.2f);
       }
@@ -77,12 +231,14 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   printf("a=b*1.2f Test Passed\n");
 
   /// c=a*b
-  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) *
                              in2(i,j,k));
@@ -93,10 +249,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// c=a+b
   gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) +
                              in2(i,j,k));
@@ -107,10 +264,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   /// c=a*a
   gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) *
                              in1(i,j,k));
@@ -121,10 +279,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
 
   //a*3.14f + b*2.7f
   gpu_out.device(sycl_device) =  gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
-  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i,j,k),
                          in1(i,j,k) * 3.14f
                        + in2(i,j,k) * 2.7f);
@@ -134,12 +293,13 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   printf("a*3.14f + b*2.7f Test Passed\n");
 
   ///d= (a>0.5? b:c)
-  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType));
   gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
-  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
-  for (int i = 0; i < sizeDim1; ++i) {
-    for (int j = 0; j < sizeDim2; ++j) {
-      for (int k = 0; k < sizeDim3; ++k) {
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+  sycl_device.synchronize();
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      for (IndexType k = 0; k < sizeDim3; ++k) {
         VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
                                                 ? in2(i, j, k)
                                                 : in3(i, j, k));
@@ -152,8 +312,50 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
   sycl_device.deallocate(gpu_in3_data);
   sycl_device.deallocate(gpu_out_data);
 }
-void test_cxx11_tensor_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_sycl_cpu(sycl_device));
+template<typename Scalar1, typename Scalar2,  int DataLayout, typename IndexType>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+    IndexType size = 20;
+    array<IndexType, 1> tensorRange = {{size}};
+    Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
+    Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
+
+    in = in.random();
+
+    Scalar1* gpu_in_data  = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+    Scalar2 * gpu_out_data =  static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+    TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
+    TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+    sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
+    gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
+    sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+    out_host = in. template cast<Scalar2>();
+    for(IndexType i=0; i< size; i++)
+    {
+      VERIFY_IS_APPROX(out(i), out_host(i));
+    }
+    printf("cast Test Passed\n");
+    sycl_device.deallocate(gpu_in_data);
+    sycl_device.deallocate(gpu_out_data);
+}
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
+  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
+  test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+  }
 }
diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
index d680e9b3b..fed269a9a 100644
--- a/unsupported/test/cxx11_tensor_symmetry.cpp
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -801,7 +801,7 @@ static void test_tensor_randacc()
   }
 }
 
-void test_cxx11_tensor_symmetry()
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
 {
   CALL_SUBTEST(test_symgroups_static());
   CALL_SUBTEST(test_symgroups_dynamic());
diff --git a/unsupported/test/cxx11_tensor_thread_local.cpp b/unsupported/test/cxx11_tensor_thread_local.cpp
new file mode 100644
index 000000000..7e866f6d1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_local.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <iostream>
+#include <unordered_set>
+
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+struct Counter {
+  Counter() = default;
+
+  void inc() {
+    // Check that mutation happens only in a thread that created this counter.
+    VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by);
+    counter_value++;
+  }
+  int value() { return counter_value; }
+
+  std::thread::id created_by;
+  int counter_value = 0;
+};
+
+struct InitCounter {
+  void operator()(Counter& counter) {
+    counter.created_by = std::this_thread::get_id();
+  }
+};
+
+void test_simple_thread_local() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 3 * num_threads;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  counter.ForEach(
+      [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); });
+}
+
+void test_zero_sized_thread_local() {
+  Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter());
+
+  Counter& local = counter.local();
+  local.inc();
+
+  int total = 0;
+  counter.ForEach([&total](std::thread::id, Counter& cnt) {
+    total += cnt.value();
+    VERIFY_IS_EQUAL(cnt.value(), 1);
+  });
+
+  VERIFY_IS_EQUAL(total, 1);
+}
+
+// All thread local values fits into the lock-free storage.
+void test_large_number_of_tasks_no_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+// Lock free thread local storage is too small to fit all the unique threads,
+// and it spills to a map guarded by a mutex.
+void test_large_number_of_tasks_with_spill() {
+  int num_threads = internal::random<int>(4, 32);
+  Eigen::ThreadPool thread_pool(num_threads);
+  Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter());
+
+  int num_tasks = 10000;
+  Eigen::Barrier barrier(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    thread_pool.Schedule([&counter, &barrier]() {
+      Counter& local = counter.local();
+      local.inc();
+      barrier.Notify();
+    });
+  }
+
+  barrier.Wait();
+
+  int total = 0;
+  std::unordered_set<std::thread::id> unique_threads;
+
+  counter.ForEach([&](std::thread::id id, Counter& cnt) {
+    total += cnt.value();
+    unique_threads.insert(id);
+  });
+
+  VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in a pool might be woken up to execute submitted tasks.
+  // Also thread_pool.Schedule() might use current thread if queue is full.
+  VERIFY_IS_EQUAL(
+      unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) {
+  CALL_SUBTEST(test_simple_thread_local());
+  CALL_SUBTEST(test_zero_sized_thread_local());
+  CALL_SUBTEST(test_large_number_of_tasks_no_spill());
+  CALL_SUBTEST(test_large_number_of_tasks_with_spill());
+}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2ef665f30..b772a1d60 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -16,29 +16,72 @@
 
 using Eigen::Tensor;
 
+class TestAllocator : public Allocator {
+ public:
+  ~TestAllocator() EIGEN_OVERRIDE {}
+  EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->alloc_count_++;
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
+    const_cast<TestAllocator*>(this)->dealloc_count_++;
+    internal::aligned_free(buffer);
+  }
+
+  int alloc_count() const { return alloc_count_; }
+  int dealloc_count() const { return dealloc_count_; }
+
+ private:
+  int alloc_count_ = 0;
+  int dealloc_count_ = 0;
+};
 
 void test_multithread_elementwise()
 {
-  Tensor<float, 3> in1(2,3,7);
-  Tensor<float, 3> in2(2,3,7);
-  Tensor<float, 3> out(2,3,7);
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
 
   in1.setRandom();
   in2.setRandom();
 
   Eigen::ThreadPool tp(internal::random<int>(3, 11));
   Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
-  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+  out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
 
-  for (int i = 0; i < 2; ++i) {
-    for (int j = 0; j < 3; ++j) {
-      for (int k = 0; k < 7; ++k) {
-        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
       }
     }
   }
 }
 
+void test_async_multithread_elementwise()
+{
+  Tensor<float, 3> in1(200, 30, 70);
+  Tensor<float, 3> in2(200, 30, 70);
+  Tensor<double, 3> out(200, 30, 70);
+
+  in1.setRandom();
+  in2.setRandom();
+
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+  Eigen::Barrier b(1);
+  out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
+  b.Wait();
+
+  for (int i = 0; i < 200; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+      }
+    }
+  }
+}
 
 void test_multithread_compound_assignment()
 {
@@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() {
   }
 }
 
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+      const TensorContractionParams&, Index, Index, Index num_rows,
+      Index num_cols) const {
+    for (int i = 0; i < num_rows; ++i) {
+      for (int j = 0; j < num_cols; ++j) {
+        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+      }
+    }
+  }
+};
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in mat4 to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+template<int DataLayout>
+void test_async_multithread_contraction_agrees_with_singlethread()
+{
+  int contract_size = internal::random<int>(100, 500);
+
+  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
+                                    contract_size,
+                                    internal::random<int>(10, 40));
+
+  Tensor<float, 4, DataLayout> right(
+      internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
+      internal::random<int>(1, 20));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+
+  Eigen::Barrier barrier(1);
+  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
+      left.contract(right, dims);
+  barrier.Wait();
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // if both of the values are very small, then do nothing (because the test
+    // will fail due to numerical precision issues when values are small)
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims);
+  barrier.Wait();
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims, SqrtOutputKernel());
+  barrier.Wait();
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
 
 template<int DataLayout>
 void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
 }
 
 template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
 {
   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
   tensor.setRandom();
 
   const int num_threads = internal::random<int>(2, 11);
   ThreadPool threads(num_threads);
-  Eigen::ThreadPoolDevice device(&threads, num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
 
   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@ void test_multithread_shuffle()
   }
 }
 
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+  const int num_threads = internal::random<int>(2, 11);
+  const int num_allocs = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  for (int a = 0; a < num_allocs; ++a) {
+    void* ptr = device.allocate(512);
+    device.deallocate(ptr);
+  }
+  VERIFY(allocator != NULL);
+  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
 
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
 {
   CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_async_multithread_elementwise());
   CALL_SUBTEST_1(test_multithread_compound_assignment());
 
   CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
 
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Test EvalShardedByInnerDimContext parallelization strategy.
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
 
-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  TestAllocator test_allocator;
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
 
-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
 }
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 000000000..009722895
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+  Tensor<float, 0, DataLayout> tensor;
+  tensor.setRandom();
+  array<ptrdiff_t, 0> dims;
+  Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+  VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+  Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+  tensor1.setRandom();
+  Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+  VERIFY_IS_EQUAL(result1.rank(), 0);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum += tensor1(i, i, i);
+  }
+  VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+  Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+  VERIFY_IS_EQUAL(result2.rank(), 0);
+  sum = 0.0f;
+  for (int i = 0; i < 7; ++i) {
+    sum += tensor2(i, i, i, i, i);
+  }
+  VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+  Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+  tensor1.setRandom();
+  array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
+  Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+  VERIFY_IS_EQUAL(result1.rank(), 1);
+  VERIFY_IS_EQUAL(result1.dimension(0), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor1(j, i, j);
+    }
+    VERIFY_IS_EQUAL(result1(i), sum);
+  }
+
+  Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+  Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+  VERIFY_IS_EQUAL(result2.rank(), 2);
+  VERIFY_IS_EQUAL(result2.dimension(0), 5);
+  VERIFY_IS_EQUAL(result2.dimension(1), 5);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 7; ++k) {
+        sum += tensor2(i, j, k, k);
+      }
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+    }
+  }
+
+  array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+  Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+  VERIFY_IS_EQUAL(result3.rank(), 2);
+  VERIFY_IS_EQUAL(result3.dimension(0), 7);
+  VERIFY_IS_EQUAL(result3.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        sum += tensor2(k, k, i, j);
+      }
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+  tensor3.setRandom();
+  array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+  Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+  VERIFY_IS_EQUAL(result4.rank(), 2);
+  VERIFY_IS_EQUAL(result4.dimension(0), 7);
+  VERIFY_IS_EQUAL(result4.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor3(k, i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result4(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+  tensor4.setRandom();
+  array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+  Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+  VERIFY_IS_EQUAL(result5.rank(), 3);
+  VERIFY_IS_EQUAL(result5.dimension(0), 3);
+  VERIFY_IS_EQUAL(result5.dimension(1), 4);
+  VERIFY_IS_EQUAL(result5.dimension(2), 5);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        sum = 0.0f;
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor4(i, l, j, l, k);
+        }
+        VERIFY_IS_EQUAL(result5(i, j, k), sum);
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> dims = { { 1, 3 } };
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.trace(dims);
+  VERIFY_IS_EQUAL(result.rank(), 2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor(i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+  CALL_SUBTEST(test_0D_trace<ColMajor>());
+  CALL_SUBTEST(test_0D_trace<RowMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+  CALL_SUBTEST(test_simple_trace<ColMajor>());
+  CALL_SUBTEST(test_simple_trace<RowMajor>());
+  CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+  CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..46fceaa19 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -12,7 +12,7 @@
 #include <Eigen/CXX11/Tensor>
 
 
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
 #define EIGEN_NO_INT128
 #else
 typedef __uint128_t uint128_t;
@@ -144,7 +144,7 @@ void test_misc2() {
 #endif
 
 
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
 {
 #ifdef EIGEN_NO_INT128
   // Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..862212e82 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@ static void test_entire_volume_patch()
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
   }
 }
 
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
 {
   CALL_SUBTEST(test_single_voxel_patch());
   CALL_SUBTEST(test_entire_volume_patch());
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 000000000..8d99a48ed
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+IndexType sizeDim0 = 4;
+IndexType sizeDim1 = 2;
+IndexType sizeDim2 = 3;
+IndexType sizeDim3 = 5;
+IndexType sizeDim4 = 7;
+array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+tensor_col_major.setRandom();
+
+
+  DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+
+
+  // single volume patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+  gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+  Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+       VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+  }
+
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+  array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+
+    DataType* gpu_data_col_major  = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+    DataType* gpu_data_row_major  = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+    TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+    TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+    sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+    gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+    sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+
+    // single volume patch: ColMajor
+    array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+    Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+    size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType);
+    DataType* gpu_data_entire_volume_patch_col_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+    TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+    gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+    sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+
+//  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+//  tensor.setRandom();
+//  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  //Tensor<float, 6> entire_volume_patch;
+  //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+//  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+  array<IndexType, 6> patchRowMajorTensorRange={{batch,patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 6, RowMajor,IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize =entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major  = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_volume_patch_test_per_device(dev_Selector s){
+QueueInterface queueInterface(s);
+auto sycl_device = Eigen::SyclDevice(&queueInterface);
+std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+for (const auto& device :Eigen::get_sycl_supported_devices()) {
+  CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+}
+}
diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
index 2b11807c8..5f63161b2 100644
--- a/unsupported/test/dgmres.cpp
+++ b/unsupported/test/dgmres.cpp
@@ -9,7 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "../../test/sparse_solver.h"
-#include <Eigen/src/IterativeSolvers/DGMRES.h>
+#include <unsupported/Eigen/IterativeSolvers>
 
 template<typename T> void test_dgmres_T()
 {
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)     );
 }
 
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
 {
   CALL_SUBTEST_1(test_dgmres_T<double>());
   CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e86..14a909d3b 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -35,7 +35,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
     VERIFY_IS_APPROX(j, jref);
 }
 
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
 {
   adtl::setNumDir(NUMBER_DIRECTIONS);
 
@@ -132,7 +132,7 @@ void test_forward_adolc()
   }
 
   {
-    // simple instanciation tests
+    // simple instantiation tests
     Matrix<adtl::adouble,2,1> x;
     foo(x);
     Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
index f2969116b..8d2254b5b 100644
--- a/unsupported/test/gmres.cpp
+++ b/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)     );
 }
 
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
 {
   CALL_SUBTEST_1(test_gmres_T<double>());
   CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/unsupported/test/idrs.cpp b/unsupported/test/idrs.cpp
new file mode 100644
index 000000000..f88c01632
--- /dev/null
+++ b/unsupported/test/idrs.cpp
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+  IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+  IDRS<SparseMatrix<T>, IncompleteLUT<T> >           idrs_colmajor_ilut;
+
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag)  );
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut)     );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+  CALL_SUBTEST_1(test_idrs_T<double>());
+  CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index e770049e5..b5b764c65 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,6 +9,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
 #ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
@@ -83,7 +84,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
 }
 
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   // DM = dense matrix; SM = sparse matrix
 
@@ -95,7 +96,7 @@ void test_kronecker_product()
   SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
   SM_a.insert(1,1) = DM_a.coeffRef(1,1) =  0.6469156566545853;
   SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
- 
+
   MatrixXd             DM_b(3,2);
   SparseMatrix<double> SM_b(3,2);
   SM_b.insert(0,0) = DM_b.coeffRef(0,0) =  0.9004440976767099;
@@ -165,7 +166,7 @@ void test_kronecker_product()
   SM_a.insert(0,3) = -0.2;
   SM_a.insert(2,4) =  0.3;
   SM_a.finalize();
-  
+
   SM_b.insert(0,0) =  0.4;
   SM_b.insert(2,1) = -0.5;
   SM_b.finalize();
@@ -183,7 +184,7 @@ void test_kronecker_product()
   DM_b2.resize(4,8);
   DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
   CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
-  
+
   for(int i = 0; i < g_repeat; i++)
   {
     double density = Eigen::internal::random<double>(0.01,0.5);
@@ -196,35 +197,35 @@ void test_kronecker_product()
     MatrixXf dA(ra,ca), dB(rb,cb), dC;
     initSparse(density, dA, sA);
     initSparse(density, dB, sB);
-    
+
     sC = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB);
     dC = kroneckerProduct(dA.transpose(),dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB.transpose());
     dC = kroneckerProduct(dA.transpose(),dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA,sB.transpose());
     dC = kroneckerProduct(dA,dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC2 = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(dA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(sA,dB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(2*sA,sB);
     dC = kroneckerProduct(2*dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@@ -236,11 +237,10 @@ void test_kronecker_product()
 #ifdef EIGEN_TEST_PART_2
 
 // simply check that for a dense kronecker product, sparse module is not needed
-
 #include "main.h"
 #include <Eigen/KroneckerProduct>
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   MatrixXd a(2,2), b(3,3), c;
   a.setRandom();
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 64f168c16..7f9a81cd3 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
 {
     // Tests using the examples provided by (c)minpack
     CALL_SUBTEST(testLmder1());
diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
index 50dec083d..b032cbf1d 100644
--- a/unsupported/test/matrix_exponential.cpp
+++ b/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
   }
 }
 
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 7c9b68a3c..6d753737d 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -23,9 +23,8 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)
 
 // Returns a matrix with eigenvalues clustered around 0, 1 and 2.
 template<typename MatrixType>
-MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+MatrixType randomMatrixWithRealEivals(const Index size)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   MatrixType diag = MatrixType::Zero(size, size);
@@ -42,16 +41,15 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
 struct randomMatrixWithImagEivals
 {
   // Returns a matrix with eigenvalues clustered around 0 and +/- i.
-  static MatrixType run(const typename MatrixType::Index size);
+  static MatrixType run(const Index size);
 };
 
 // Partial specialization for real matrices
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 0>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     MatrixType diag = MatrixType::Zero(size, size);
     Index i = 0;
@@ -77,9 +75,8 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 1>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     const Scalar imagUnit(0, 1);
@@ -171,7 +168,6 @@ void testMatrixType(const MatrixType& m)
 {
   // Matrices with clustered eigenvalue lead to different code paths
   // in MatrixFunction.h and are thus useful for testing.
-  typedef typename MatrixType::Index Index;
 
   const Index size = m.rows();
   for (int i = 0; i < g_repeat; i++) {
@@ -181,7 +177,40 @@ void testMatrixType(const MatrixType& m)
   }
 }
 
-void test_matrix_function()
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)
+{
+  // Test if passing Ref and Map objects is possible
+  // (Regression test for Bug #1796)
+  Index size = A.rows();
+  MatrixType X; X.setRandom(size, size);
+  MatrixType Y(size,size);
+  Ref<      MatrixType> R(Y);
+  Ref<const MatrixType> Rc(X);
+  Map<      MatrixType> M(Y.data(), size, size);
+  Map<const MatrixType> Mc(X.data(), size, size);
+
+  X = X*X; // make sure sqrt is possible
+  Y = X.sqrt();
+  R = Rc.sqrt();
+  M = Mc.sqrt();
+  Y = X.exp();
+  R = Rc.exp();
+  M = Mc.exp();
+  X = Y; // make sure log is possible
+  Y = X.log();
+  R = Rc.log();
+  M = Mc.log();
+
+  Y = X.cos() + Rc.cos() + Mc.cos();
+  Y = X.sin() + Rc.sin() + Mc.sin();
+
+  Y = X.cosh() + Rc.cosh() + Mc.cosh();
+  Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+EIGEN_DECLARE_TEST(matrix_function)
 {
   CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
   CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
@@ -190,4 +219,9 @@ void test_matrix_function()
   CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
   CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
   CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+  CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));
+  CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+  CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+  CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
 }
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 7ccfacfdf..dbaf9dbdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -19,7 +19,7 @@ void test2dRotation(const T& tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
@@ -61,7 +61,7 @@ void test3dRotation(const T& tol)
   for (int i=0; i<=20; ++i) {
     v = Matrix<T,3,1>::Random();
     v.normalize();
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
@@ -150,55 +150,55 @@ typedef Matrix<double,3,3,RowMajor>         Matrix3dRowMajor;
 typedef Matrix<long double,3,3>             Matrix3e;
 typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
  
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
-  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5f));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
   CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
-  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
   CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
 
   CALL_SUBTEST_10(test3dRotation<double>(1e-13));
-  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
   CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
 
   CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3f)); // see bug 614
   CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
index ea541e1ea..034f29217 100644
--- a/unsupported/test/matrix_square_root.cpp
+++ b/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
   VERIFY_IS_APPROX(sqrtA * sqrtA, A);
 }
 
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
 {
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 8b300b78a..2eb40fef6 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
 
 }
 
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
 {
   CALL_SUBTEST_1(test_minres_T<double>());
 //  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 8404f1ff8..000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*
-    MPFR C++: Multi-precision floating point number class for C++.
-    Based on MPFR library:    http://mpfr.org
-
-    Project homepage:    http://www.holoborodko.com/pavel/mpfr
-    Contact e-mail:      pavel@holoborodko.com
-
-    Copyright (c) 2008-2015 Pavel Holoborodko
-
-    Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
-    Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
-    Rodney James, Jorge Leitao.
-
-    Licensing:
-    (A) MPFR C++ is under GNU General Public License ("GPL").
-
-    (B) Non-free licenses may also be purchased from the author, for users who
-        do not want their programs protected by the GPL.
-
-        The non-free licenses are for users that wish to use MPFR C++ in
-        their products but are unwilling to release their software
-        under the GPL (which would require them to release source code
-        and allow free redistribution).
-
-        Such users can purchase an unlimited-use license from the author.
-        Contact us for more details.
-
-    GNU General Public License ("GPL") copyright permissions statement:
-    **************************************************************************
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
-                                                // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
-                                                // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 2
-#define MPREAL_VERSION_STRING "3.6.2"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__)
-    #define IsInf(x) (isinf)(x)                 // GNU C++/Intel ICC compiler on Linux
-#elif defined(_MSC_VER)                         // Microsoft Visual C++
-    #define IsInf(x) (!_finite(x))
-#else
-    #define IsInf(x) (std::isinf)(x)              // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
-    #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic). Borrowed from Eigen.
-#if (__has_feature(cxx_rvalue_references) || \
-       defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-      (defined(_MSC_VER) && _MSC_VER >= 1600))
-
-    #define MPREAL_HAVE_MOVE_SUPPORT
-
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
-    #define mpfr_is_initialized(x)      (0 != (x)->_mpfr_d)
-    #define mpfr_set_uninitialized(x)   ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
-       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
-       (defined(_MSC_VER) && _MSC_VER >= 1800))
-
-    #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
-    #define MPREAL_MSVC_DEBUGVIEW_CODE     DebugView = toString();
-    #define MPREAL_MSVC_DEBUGVIEW_DATA     std::string DebugView;
-#else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE
-    #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
-    #include <cstdlib>                          // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1          // Triggers overflow exception during conversion to double if mpreal
-                                                // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
-                                                // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be compatible with new versions of MPFR
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x)  ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
-  #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
-  #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
-    mpfr_t mp;
-
-public:
-
-    // Get default rounding mode & precision
-    inline static mp_rnd_t   get_default_rnd()    {    return (mp_rnd_t)(mpfr_get_default_rounding_mode());       }
-    inline static mp_prec_t  get_default_prec()   {    return mpfr_get_default_prec();                            }
-
-    // Constructors && type conversions
-    mpreal();
-    mpreal(const mpreal& u);
-    mpreal(const mpf_t u);
-    mpreal(const mpz_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const mpq_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const double u,                 mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u,           mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u,                    mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
-    mpreal(const mpfr_t  u, bool shared = false);
-
-    mpreal(const char* s,             mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const std::string& s,      mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
-    ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    mpreal& operator=(mpreal&& v);
-    mpreal(mpreal&& u);
-#endif
-
-    // Operations
-    // =
-    // +, -, *, /, ++, --, <<, >>
-    // *=, +=, -=, /=,
-    // <, >, ==, <=, >=
-
-    // =
-    mpreal& operator=(const mpreal& v);
-    mpreal& operator=(const mpf_t v);
-    mpreal& operator=(const mpz_t v);
-    mpreal& operator=(const mpq_t v);
-    mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);
-    mpreal& operator=(const unsigned long int v);
-    mpreal& operator=(const unsigned long long int v);
-    mpreal& operator=(const long long int v);
-    mpreal& operator=(const unsigned int v);
-    mpreal& operator=(const long int v);
-    mpreal& operator=(const int v);
-    mpreal& operator=(const char* s);
-    mpreal& operator=(const std::string& s);
-    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
-    // +
-    mpreal& operator+=(const mpreal& v);
-    mpreal& operator+=(const mpf_t v);
-    mpreal& operator+=(const mpz_t v);
-    mpreal& operator+=(const mpq_t v);
-    mpreal& operator+=(const long double u);
-    mpreal& operator+=(const double u);
-    mpreal& operator+=(const unsigned long int u);
-    mpreal& operator+=(const unsigned int u);
-    mpreal& operator+=(const long int u);
-    mpreal& operator+=(const int u);
-
-    mpreal& operator+=(const long long int  u);
-    mpreal& operator+=(const unsigned long long int u);
-    mpreal& operator-=(const long long int  u);
-    mpreal& operator-=(const unsigned long long int u);
-    mpreal& operator*=(const long long int  u);
-    mpreal& operator*=(const unsigned long long int u);
-    mpreal& operator/=(const long long int  u);
-    mpreal& operator/=(const unsigned long long int u);
-
-    const mpreal operator+() const;
-    mpreal& operator++ ();
-    const mpreal  operator++ (int);
-
-    // -
-    mpreal& operator-=(const mpreal& v);
-    mpreal& operator-=(const mpz_t v);
-    mpreal& operator-=(const mpq_t v);
-    mpreal& operator-=(const long double u);
-    mpreal& operator-=(const double u);
-    mpreal& operator-=(const unsigned long int u);
-    mpreal& operator-=(const unsigned int u);
-    mpreal& operator-=(const long int u);
-    mpreal& operator-=(const int u);
-    const mpreal operator-() const;
-    friend const mpreal operator-(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator-(const unsigned int b,      const mpreal& a);
-    friend const mpreal operator-(const long int b,          const mpreal& a);
-    friend const mpreal operator-(const int b,               const mpreal& a);
-    friend const mpreal operator-(const double b,            const mpreal& a);
-    mpreal& operator-- ();
-    const mpreal  operator-- (int);
-
-    // *
-    mpreal& operator*=(const mpreal& v);
-    mpreal& operator*=(const mpz_t v);
-    mpreal& operator*=(const mpq_t v);
-    mpreal& operator*=(const long double v);
-    mpreal& operator*=(const double v);
-    mpreal& operator*=(const unsigned long int v);
-    mpreal& operator*=(const unsigned int v);
-    mpreal& operator*=(const long int v);
-    mpreal& operator*=(const int v);
-
-    // /
-    mpreal& operator/=(const mpreal& v);
-    mpreal& operator/=(const mpz_t v);
-    mpreal& operator/=(const mpq_t v);
-    mpreal& operator/=(const long double v);
-    mpreal& operator/=(const double v);
-    mpreal& operator/=(const unsigned long int v);
-    mpreal& operator/=(const unsigned int v);
-    mpreal& operator/=(const long int v);
-    mpreal& operator/=(const int v);
-    friend const mpreal operator/(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator/(const unsigned int b,      const mpreal& a);
-    friend const mpreal operator/(const long int b,          const mpreal& a);
-    friend const mpreal operator/(const int b,               const mpreal& a);
-    friend const mpreal operator/(const double b,            const mpreal& a);
-
-    //<<= Fast Multiplication by 2^u
-    mpreal& operator<<=(const unsigned long int u);
-    mpreal& operator<<=(const unsigned int u);
-    mpreal& operator<<=(const long int u);
-    mpreal& operator<<=(const int u);
-
-    //>>= Fast Division by 2^u
-    mpreal& operator>>=(const unsigned long int u);
-    mpreal& operator>>=(const unsigned int u);
-    mpreal& operator>>=(const long int u);
-    mpreal& operator>>=(const int u);
-
-    // Type Conversion operators
-    bool               toBool      (                        )    const;
-    long               toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long      toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    long long          toLLong     (mp_rnd_t mode = GMP_RNDZ)    const;
-    unsigned long long toULLong    (mp_rnd_t mode = GMP_RNDZ)    const;
-    float              toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
-    double             toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
-    long double        toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool();                 }
-    explicit operator int                () const { return int(toLong());            }
-    explicit operator long               () const { return toLong();                 }
-    explicit operator long long          () const { return toLLong();                }
-    explicit operator unsigned           () const { return unsigned(toULong());      }
-    explicit operator unsigned long      () const { return toULong();                }
-    explicit operator unsigned long long () const { return toULLong();               }
-    explicit operator float              () const { return toFloat();                }
-    explicit operator double             () const { return toDouble();               }
-    explicit operator long double        () const { return toLDouble();              }
-#endif
-
-    // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
-    ::mpfr_ptr    mpfr_ptr();
-    ::mpfr_srcptr mpfr_ptr()    const;
-    ::mpfr_srcptr mpfr_srcptr() const;
-
-    // Convert mpreal to string with n significant digits in base b
-    // n = -1 -> convert with the maximum available digits
-    std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    std::string toString(const std::string& format) const;
-#endif
-
-    std::ostream& output(std::ostream& os) const;
-
-    // Math Functions
-    friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
-    friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend int cmpabs(const mpreal& a,const mpreal& b);
-
-    friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
-    friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal acos  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asin  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
-    friend const mpreal acot  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asec  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsc  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cosh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sinh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tanh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sech  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csch  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal coth  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    friend const mpreal fac_ui (unsigned long int v,  mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
-    friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fma      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal fms      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal agm      (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum      (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
-    friend int sgn(const mpreal& v); // returns -1 or +1
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    friend int          sinh_cosh   (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal li2         (const mpreal& v,                       mp_rnd_t rnd_mode);
-    friend const mpreal fmod        (const mpreal& x, const mpreal& y,      mp_rnd_t rnd_mode);
-    friend const mpreal rec_sqrt    (const mpreal& v,                       mp_rnd_t rnd_mode);
-
-    // MATLAB's semantic equivalents
-    friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
-    friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend const mpreal digamma (const mpreal& v,        mp_rnd_t rnd_mode);
-    friend const mpreal ai      (const mpreal& v,        mp_rnd_t rnd_mode);
-    friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal grandom (unsigned int seed);
-#endif
-
-    // Uniformly distributed random number generation in [0,1] using
-    // Mersenne-Twister algorithm by default.
-    // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
-    // Check urandom() for more precise control.
-    friend const mpreal random(unsigned int seed);
-
-    // Splits mpreal value into fractional and integer parts.
-    // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);
-
-    // Constants
-    // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
-    friend const mpreal const_log2      (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_pi        (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_euler     (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_catalan   (mp_prec_t prec, mp_rnd_t rnd_mode);
-
-    // returns +inf iff sign>=0 otherwise -inf
-    friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
-    // Output/ Input
-    friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
-    friend std::istream& operator>>(std::istream& is, mpreal& v);
-
-    // Integer Related Functions
-    friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ceil (const mpreal& v);
-    friend const mpreal floor(const mpreal& v);
-    friend const mpreal round(const mpreal& v);
-    friend const mpreal trunc(const mpreal& v);
-    friend const mpreal rint_ceil   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_floor  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_round  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_trunc  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal frac        (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal remainder   (         const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal remquo      (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    // Miscellaneous Functions
-    friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
-    friend const mpreal nextabove  (const mpreal& x);
-    friend const mpreal nextbelow  (const mpreal& x);
-
-    // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-    friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
-    // Instance Checkers
-    friend bool (isnan)    (const mpreal& v);
-    friend bool (isinf)    (const mpreal& v);
-    friend bool (isfinite) (const mpreal& v);
-
-    friend bool isnum    (const mpreal& v);
-    friend bool iszero   (const mpreal& v);
-    friend bool isint    (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend bool isregular(const mpreal& v);
-#endif
-
-    // Set/Get instance properties
-    inline mp_prec_t    get_prec() const;
-    inline void         set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd());    // Change precision with rounding mode
-
-    // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
-    inline mpreal&      setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
-    inline int          getPrecision() const;
-
-    // Set mpreal to +/- inf, NaN, +/-0
-    mpreal&        setInf  (int Sign = +1);
-    mpreal&        setNan  ();
-    mpreal&        setZero (int Sign = +1);
-    mpreal&        setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
-    //Exponent
-    mp_exp_t get_exp();
-    int set_exp(mp_exp_t e);
-    int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
-    // Inexact conversion from float
-    inline bool fits_in_bits(double x, int n);
-
-    // Set/Get global properties
-    static void            set_default_prec(mp_prec_t prec);
-    static void            set_default_rnd(mp_rnd_t rnd_mode);
-
-    static mp_exp_t  get_emin (void);
-    static mp_exp_t  get_emax (void);
-    static mp_exp_t  get_emin_min (void);
-    static mp_exp_t  get_emin_max (void);
-    static mp_exp_t  get_emax_min (void);
-    static mp_exp_t  get_emax_max (void);
-    static int       set_emin (mp_exp_t exp);
-    static int       set_emax (mp_exp_t exp);
-
-    // Efficient swapping of two mpreal values - needed for std algorithms
-    friend void swap(mpreal& x, mpreal& y);
-
-    friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
-    // Human friendly Debug Preview in Visual Studio.
-    // Put one of these lines:
-    //
-    // mpfr::mpreal=<DebugView>                              ; Show value only
-    // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
-    //
-    // at the beginning of
-    // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
-    MPREAL_MSVC_DEBUGVIEW_DATA
-
-    // "Smart" resources deallocation. Checks if instance initialized before deletion.
-    void clear(::mpfr_ptr);
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
-    std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
-    mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
-    mpfr_set_zero_fast(mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
-    mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
-    mpfr_set  (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
-    mpfr_set_uninitialized(mpfr_ptr());     // make sure "other" holds no pointer to actual data
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t  u, bool shared)
-{
-    if(shared)
-    {
-        std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
-    }
-    else
-    {
-        mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
-        mpfr_set  (mpfr_ptr(), u, mpreal::get_default_rnd());
-    }
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpf_t u)
-{
-    mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
-    mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_z(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_q(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
-     mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-  if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
-  {
-    mpfr_set_d(mpfr_ptr(), u, mode);
-  }else
-    throw conversion_overflow();
-#else
-  mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ld(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_uj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_sj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s, base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    if(mpfr_is_initialized(x))
-#endif
-    mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
-    clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
-    // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
-    // This is needed for smooth integration with libraries based on expression templates, like Eigen.
-    // TODO: Do the same for boolean operators.
-    template <typename ArgumentType> struct result_type {};
-
-    template <> struct result_type<mpreal>              {typedef mpreal type;};
-    template <> struct result_type<mpz_t>               {typedef mpreal type;};
-    template <> struct result_type<mpq_t>               {typedef mpreal type;};
-    template <> struct result_type<long double>         {typedef mpreal type;};
-    template <> struct result_type<double>              {typedef mpreal type;};
-    template <> struct result_type<unsigned long int>   {typedef mpreal type;};
-    template <> struct result_type<unsigned int>        {typedef mpreal type;};
-    template <> struct result_type<long int>            {typedef mpreal type;};
-    template <> struct result_type<int>                 {typedef mpreal type;};
-    template <> struct result_type<long long>           {typedef mpreal type;};
-    template <> struct result_type<unsigned long long>  {typedef mpreal type;};
-}
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs;    }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs;    }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs;    }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs;    }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs;    }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
-// Gives max & min values for the required precision,
-// minval is 'safe' meaning 1 / minval does not overflow
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
-// 'Bitwise' equality check
-//  maxUlps - a and b can be apart by maxUlps binary numbers.
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int       bits2digits(mp_prec_t b);
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
-    if (this != &v)
-    {
-    mp_prec_t tp = mpfr_get_prec(  mpfr_srcptr());
-    mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
-    if(tp != vp){
-      clear(mpfr_ptr());
-      mpfr_init2(mpfr_ptr(), vp);
-    }
-
-        mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
-    mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
-    mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
-    mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
-    mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-  if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
-  {
-    mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-  }else
-    throw conversion_overflow();
-#else
-  mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
-  MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
-    mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
-    mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //        mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //        mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-template <typename real_t>
-inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
-{
-    return *this = z.real();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// + Addition
-inline mpreal& mpreal::operator+=(const mpreal& v)
-{
-    mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpf_t u)
-{
-    *this += mpreal(u);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpz_t u)
-{
-    mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpq_t u)
-{
-    mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+= (const long double u)
-{
-    *this += mpreal(u);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+= (const double u)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-#else
-    *this += mpreal(u);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned long int u)
-{
-    mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned int u)
-{
-    mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long int u)
-{
-    mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const int u)
-{
-    mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long long int u)         {    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator+=(const unsigned long long int u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const long long int  u)        {    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator-=(const unsigned long long int u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const long long int  u)        {    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator*=(const unsigned long long int u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const long long int  u)        {    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-inline mpreal& mpreal::operator/=(const unsigned long long int u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
-
-inline const mpreal mpreal::operator+()const    {    return mpreal(*this); }
-
-inline const mpreal operator+(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline mpreal& mpreal::operator++()
-{
-    return *this += 1;
-}
-
-inline const mpreal mpreal::operator++ (int)
-{
-    mpreal x(*this);
-    *this += 1;
-    return x;
-}
-
-inline mpreal& mpreal::operator--()
-{
-    return *this -= 1;
-}
-
-inline const mpreal mpreal::operator-- (int)
-{
-    mpreal x(*this);
-    *this -= 1;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// - Subtraction
-inline mpreal& mpreal::operator-=(const mpreal& v)
-{
-    mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpz_t v)
-{
-    mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpq_t v)
-{
-    mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long double v)
-{
-    *this -= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this -= mpreal(v);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned long int v)
-{
-    mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned int v)
-{
-    mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long int v)
-{
-    mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator-=(const int v)
-{
-    mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal mpreal::operator-()const
-{
-    mpreal u(*this);
-    mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-    return u;
-}
-
-inline const mpreal operator-(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline const mpreal operator-(const double  b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-#else
-    mpreal x(b, mpfr_get_prec(a.mpfr_ptr()));
-    x -= a;
-    return x;
-#endif
-}
-
-inline const mpreal operator-(const unsigned long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const unsigned int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator-(const int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// * Multiplication
-inline mpreal& mpreal::operator*= (const mpreal& v)
-{
-    mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpz_t v)
-{
-    mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpq_t v)
-{
-    mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long double v)
-{
-    *this *= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this *= mpreal(v);
-#endif
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned long int v)
-{
-    mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned int v)
-{
-    mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long int v)
-{
-    mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator*=(const int v)
-{
-    mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator*(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
-  mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// / Division
-inline mpreal& mpreal::operator/=(const mpreal& v)
-{
-    mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpz_t v)
-{
-    mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpq_t v)
-{
-    mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long double v)
-{
-    *this /= mpreal(v);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
-    *this /= mpreal(v);
-#endif
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned long int v)
-{
-    mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned int v)
-{
-    mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long int v)
-{
-    mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator/=(const int v)
-{
-    mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator/(const mpreal& a, const mpreal& b)
-{
-  mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
-  mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
-  return c;
-}
-
-inline const mpreal operator/(const unsigned long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const unsigned int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const long int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const int b, const mpreal& a)
-{
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal operator/(const double  b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
-    return x;
-#else
-    mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
-    x /= a;
-    return x;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Shifts operators - Multiplication/Division by power of 2
-inline mpreal& mpreal::operator<<=(const unsigned long int u)
-{
-    mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const unsigned int u)
-{
-    mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const long int u)
-{
-    mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const int u)
-{
-    mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned long int u)
-{
-    mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned int u)
-{
-    mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const long int u)
-{
-    mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const int u)
-{
-    mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned long int k)
-{
-    return mul_2ui(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned int k)
-{
-    return mul_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
-    return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
-    return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
-    return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
-    return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
-    return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
-    return div_2si(v,static_cast<long int>(k));
-}
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//Relational operators
-
-// WARNING:
-//
-// Please note that following checks for double-NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) =  (b != b)
-// isnan(b) = !(b == b)  (we use in code below)
-//
-// Be cautions if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
-
-inline bool operator >  (const mpreal& a, const mpreal& b           ){  return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );            }
-inline bool operator >  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
-inline bool operator >  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 );    }
-inline bool operator >  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 );    }
-
-inline bool operator >= (const mpreal& a, const mpreal& b           ){  return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );       }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
-// inline bool operator >= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (isnan()a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
-inline bool operator >= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 );   }
-inline bool operator >= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 );   }
-
-inline bool operator <  (const mpreal& a, const mpreal& b           ){  return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );               }
-inline bool operator <  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
-inline bool operator <  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 );    }
-inline bool operator <  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 );    }
-
-inline bool operator <= (const mpreal& a, const mpreal& b           ){  return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );          }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
-inline bool operator <= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 );   }
-inline bool operator <= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 );   }
-
-inline bool operator == (const mpreal& a, const mpreal& b           ){  return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );              }
-inline bool operator == (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
-inline bool operator == (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 );   }
-inline bool operator == (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 );   }
-
-inline bool operator != (const mpreal& a, const mpreal& b           ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const unsigned long int b ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const unsigned int b      ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const long int b          ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const int b               ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const long double b       ){  return !(a == b);  }
-inline bool operator != (const mpreal& a, const double b            ){  return !(a == b);  }
-
-inline bool (isnan)    (const mpreal& op){    return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool (isinf)    (const mpreal& op){    return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 );    }
-inline bool (isfinite) (const mpreal& op){    return (mpfr_number_p (op.mpfr_srcptr()) != 0 );    }
-inline bool iszero   (const mpreal& op){    return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 );    }
-inline bool isint    (const mpreal& op){    return (mpfr_integer_p(op.mpfr_srcptr()) != 0 );    }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){    return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool               mpreal::toBool   (             )  const    {    return  mpfr_zero_p (mpfr_srcptr()) == 0;     }
-inline long               mpreal::toLong   (mp_rnd_t mode)  const    {    return  mpfr_get_si (mpfr_srcptr(), mode);    }
-inline unsigned long      mpreal::toULong  (mp_rnd_t mode)  const    {    return  mpfr_get_ui (mpfr_srcptr(), mode);    }
-inline float              mpreal::toFloat  (mp_rnd_t mode)  const    {    return  mpfr_get_flt(mpfr_srcptr(), mode);    }
-inline double             mpreal::toDouble (mp_rnd_t mode)  const    {    return  mpfr_get_d  (mpfr_srcptr(), mode);    }
-inline long double        mpreal::toLDouble(mp_rnd_t mode)  const    {    return  mpfr_get_ld (mpfr_srcptr(), mode);    }
-inline long long          mpreal::toLLong  (mp_rnd_t mode)  const    {    return  mpfr_get_sj (mpfr_srcptr(), mode);    }
-inline unsigned long long mpreal::toULLong (mp_rnd_t mode)  const    {    return  mpfr_get_uj (mpfr_srcptr(), mode);    }
-
-inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_srcptr() const    { return mp; }
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
-    std::ostringstream oss;
-    oss << f << t;
-    return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
-    char *s = NULL;
-    std::string out;
-
-    if( !format.empty() )
-    {
-        if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
-        {
-            out = std::string(s);
-
-            mpfr_free_str(s);
-        }
-    }
-
-    return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
-    // TODO: Add extended format specification (f, e, rounding mode) as it done in output operator
-    (void)b;
-    (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-    std::ostringstream format;
-
-    int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
-    format << "%." << digits << "RNg";
-
-    return toString(format.str());
-
-#else
-
-    char *s, *ns = NULL;
-    size_t slen, nslen;
-    mp_exp_t exp;
-    std::string out;
-
-    if(mpfr_inf_p(mp))
-    {
-        if(mpfr_sgn(mp)>0) return "+Inf";
-        else               return "-Inf";
-    }
-
-    if(mpfr_zero_p(mp)) return "0";
-    if(mpfr_nan_p(mp))  return "NaN";
-
-    s  = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
-    ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
-    if(s!=NULL && ns!=NULL)
-    {
-        slen  = strlen(s);
-        nslen = strlen(ns);
-        if(nslen<=slen)
-        {
-            mpfr_free_str(s);
-            s = ns;
-            slen = nslen;
-        }
-        else {
-            mpfr_free_str(ns);
-        }
-
-        // Make human eye-friendly formatting if possible
-        if (exp>0 && static_cast<size_t>(exp)<slen)
-        {
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp) ptr--;
-
-                if(ptr==s+exp) out = std::string(s,exp+1);
-                else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
-                //out = string(s,exp+1)+'.'+string(s+exp+1);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
-                if(ptr==s+exp-1) out = std::string(s,exp);
-                else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
-                //out = string(s,exp)+'.'+string(s+exp);
-            }
-
-        }else{ // exp<0 || exp>slen
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+1) ptr--;
-
-                if(ptr==s+1) out = std::string(s,2);
-                else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
-                //out = string(s,2)+'.'+string(s+2);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s) ptr--;
-
-                if(ptr==s) out = std::string(s,1);
-                else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
-                //out = string(s,1)+'.'+string(s+1);
-            }
-
-            // Make final string
-            if(--exp)
-            {
-                if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
-                else       out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
-            }
-        }
-
-        mpfr_free_str(s);
-        return out;
-    }else{
-        return "conversion error!";
-    }
-#endif
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
-    std::ostringstream format;
-    const std::ios::fmtflags flags = os.flags();
-
-    format << ((flags & std::ios::showpos) ? "%+" : "%");
-    if (os.precision() >= 0)
-        format << '.' << os.precision() << "R*"
-               << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
-                   (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
-                   'g');
-    else
-        format << "R*e";
-
-    char *s = NULL;
-    if(!(mpfr_asprintf(&s, format.str().c_str(),
-                        mpfr::mpreal::get_default_rnd(),
-                        mpfr_srcptr())
-        < 0))
-    {
-        os << std::string(s);
-        mpfr_free_str(s);
-    }
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
-    return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
-    // TODO: use cout::hexfloat and other flags to setup base
-    std::string tmp;
-    is >> tmp;
-    mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
-    return is;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//     Bits - decimal digits relation
-//        bits   = ceil(digits*log[2](10))
-//        digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
-    const double LOG2_10 = 3.3219280948873624;
-
-    return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
-    const double LOG10_2 = 0.30102999566398119;
-
-    return int(std::floor( b * LOG10_2 ));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline int sgn(const mpreal& op)
-{
-    return mpfr_sgn(op.mpfr_srcptr());
-}
-
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
-    mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), (sign < 0 ? 1 : 0), RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
-    return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
-    mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
-    mpfr_set_inf(mpfr_ptr(), sign);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
-    mpfr_set_nan(mpfr_ptr());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    mpfr_set_zero(mpfr_ptr(), sign);
-#else
-    mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
-    setSign(sign);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
-    return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
-    mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp ()
-{
-    return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
-    int x = mpfr_set_exp(mpfr_ptr(), e);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
-    *exp = mpfr_get_exp(y.mpfr_srcptr());
-    mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
-    return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
-    mpreal x(v);
-
-    // rounding is not important since we are just increasing the exponent (= exact operation)
-    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
-    return ldexp(v, exp);
-}
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
-    /* the smallest eps such that 1 + eps != 1 */
-    return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
-    /* the smallest eps such that x + eps != x */
-    if( x < 0)
-    {
-        return nextabove(-x) + x;
-    }else{
-        return nextabove( x) - x;
-    }
-}
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
-    /* min = 1/2 * 2^emin = 2^(emin - 1) */
-    return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
-    /* max = (1 - eps) * 2^emax, eps is machine epsilon */
-    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
-    return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
-    return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
-    return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const  mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
-    mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
-    return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
-    return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
-    mpreal f(v);
-
-    // rounding is not important since we are using the same number
-    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
-    mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
-    return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
-    return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
-    int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
-    return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
-    return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
-    return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
-    return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
-    return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
-    return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
-    return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
-    return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
-        mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
-        mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
-        return y;
-
-inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt);    }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
-    mpreal y;
-    mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
-    return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
-    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
-    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
-    return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
-    return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt  (const long double v, mp_rnd_t rnd_mode)    {   return sqrt(mpreal(v),rnd_mode);    }
-inline const mpreal sqrt  (const double v, mp_rnd_t rnd_mode)         {   return sqrt(mpreal(v),rnd_mode);    }
-
-inline const mpreal cbrt  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt );    }
-inline const mpreal fabs  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal abs   (const mpreal& x, mp_rnd_t r)                             {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal log   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log  );    }
-inline const mpreal log2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log2 );    }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log10);    }
-inline const mpreal exp   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp  );    }
-inline const mpreal exp2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 );    }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp10);    }
-inline const mpreal cos   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cos  );    }
-inline const mpreal sin   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sin  );    }
-inline const mpreal tan   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tan  );    }
-inline const mpreal sec   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sec  );    }
-inline const mpreal csc   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csc  );    }
-inline const mpreal cot   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cot  );    }
-inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acos );    }
-inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
-inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
-
-inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);                    }
-
-inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);                      }
-inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);                      }
-inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);                      }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atanh(1/v, r);                      }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acosh(1/v, r);                      }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asinh(1/v, r);                      }
-
-inline const mpreal cosh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cosh );    }
-inline const mpreal sinh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sinh );    }
-inline const mpreal tanh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tanh );    }
-inline const mpreal sech  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sech );    }
-inline const mpreal csch  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csch );    }
-inline const mpreal coth  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(coth );    }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acosh);    }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asinh);    }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atanh);    }
-
-inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log1p  );    }
-inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
-inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
-inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
-inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
-inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
-inline const mpreal erfc    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erfc   );    }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j0     );    }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j1     );    }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y0     );    }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y1     );    }
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec     = mpreal::get_default_prec(),
-                                           mp_rnd_t  rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(0, prec);
-    mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
-    return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    int tsignp;
-
-    if(signp)   mpfr_lgamma(x.mpfr_ptr(),  signp,v.mpfr_srcptr(),rnd_mode);
-    else        mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
-    return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal  y(0, x.getPrecision());
-    mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal  y(0, x.getPrecision());
-    mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-
-    a.set_prec(p1>p2?p1:p2);
-
-    mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpfr_srcptr *p = new mpfr_srcptr[n];
-
-    for (unsigned long int  i = 0; i < n; i++)
-        p[i] = tab[i].mpfr_srcptr();
-
-    mpreal x;
-    status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
-    delete [] p;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    /*  R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
-    return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    (void)rnd_mode;
-
-    /*
-
-    m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
-    The following are true by convention:
-    - mod(x,0) is x
-    - mod(x,x) is 0
-    - mod(x,y) for x != y and y != 0 has the same sign as y.
-
-    */
-
-    if(iszero(y)) return x;
-    if(x == y) return 0;
-
-    mpreal m = x - floor(x / y) * y;
-
-    m.setSign(sgn(y)); // make sure result has the same sign as Y
-
-    return m;
-}
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t yp, xp;
-
-    yp = y.get_prec();
-    xp = x.get_prec();
-
-    a.set_prec(yp>xp?yp:xp);
-
-    mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
-    return x;
-}
-#endif //  MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(digamma);     }
-inline const mpreal ai      (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(ai);          }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_log2(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_pi(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_euler(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_catalan(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
-    mpreal x(0, p);
-    mpfr_set_inf(x.mpfr_ptr(), sign);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_ceil(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_floor(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_round(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_trunc(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal rint       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint      );     }
-inline const mpreal rint_ceil  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil );     }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor);     }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round);     }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc);     }
-inline const mpreal frac       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(frac      );     }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void         swap (mpreal& a, mpreal& b)            {    mpfr_swap(a.mp,b.mp);   }
-inline const mpreal (max)(const mpreal& x, const mpreal& y){    return (x>y?x:y);       }
-inline const mpreal (min)(const mpreal& x, const mpreal& y){    return (x<y?x:y);       }
-
-inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fmin(const mpreal& x, const mpreal& y,  mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
-{
-    mpreal a(x);
-    mpfr_nexttoward(a.mp,y.mp);
-    return a;
-}
-
-inline const mpreal nextabove  (const mpreal& x)
-{
-    mpreal a(x);
-    mpfr_nextabove(a.mp);
-    return a;
-}
-
-inline const mpreal nextbelow  (const mpreal& x)
-{
-    mpreal a(x);
-    mpfr_nextbelow(a.mp);
-    return a;
-}
-
-inline const mpreal urandomb (gmp_randstate_t& state)
-{
-    mpreal x;
-    mpfr_urandomb(x.mpfr_ptr(),state);
-    return x;
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
-    return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
-    mpreal x;
-    mpfr_random2(x.mpfr_ptr(),size,exp);
-    return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- initialization & first random number generation
-// a = random();     <- next random numbers generation
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0)    gmp_randseed_ui(state,seed);
-
-    return mpfr::urandom(state);
-#else
-    if(seed != 0)    std::srand(seed);
-    return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
-    return x;
-}
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
-    mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
-    mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
-    int i;
-    double t;
-    return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)     return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else          return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)     return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else          return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    mpreal x(a);
-    mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0)    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else       return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0)    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else       return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
-  // we are allowed to extend namespace std with specializations only
-    template <>
-    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
-    {
-        return mpfr::swap(x, y);
-    }
-
-    template<>
-    class numeric_limits<mpfr::mpreal>
-    {
-    public:
-        static const bool is_specialized    = true;
-        static const bool is_signed         = true;
-        static const bool is_integer        = false;
-        static const bool is_exact          = false;
-        static const int  radix             = 2;
-
-        static const bool has_infinity      = true;
-        static const bool has_quiet_NaN     = true;
-        static const bool has_signaling_NaN = true;
-
-        static const bool is_iec559         = true;        // = IEEE 754
-        static const bool is_bounded        = true;
-        static const bool is_modulo         = false;
-        static const bool traps             = true;
-        static const bool tinyness_before   = true;
-
-        static const float_denorm_style has_denorm  = denorm_absent;
-
-        inline static mpfr::mpreal (min)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::minval(precision);  }
-        inline static mpfr::mpreal (max)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::maxval(precision);  }
-        inline static mpfr::mpreal lowest   (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return -mpfr::maxval(precision);  }
-
-        // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
-        inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision); }
-
-        // Returns smallest eps such that x + eps != x (relative machine epsilon)
-        inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
-
-        inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);
-            else               return mpfr::mpreal(1.0, precision);
-        }
-
-        inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
-        inline static const mpfr::mpreal quiet_NaN()        { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal signaling_NaN()    { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal denorm_min()       { return (min)();                    }
-
-        // Please note, exponent range is not fixed in MPFR
-        static const int min_exponent = MPFR_EMIN_DEFAULT;
-        static const int max_exponent = MPFR_EMAX_DEFAULT;
-        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
-        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
-        // Following members should be constant according to standard, but they can be variable in MPFR
-        // So we define them as functions here.
-        //
-        // This is preferable way for std::numeric_limits<mpfr::mpreal> specialization.
-        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
-        // See below for compatible implementation.
-        inline static float_round_style round_style()
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            switch (r)
-            {
-            case GMP_RNDN: return round_to_nearest;
-            case GMP_RNDZ: return round_toward_zero;
-            case GMP_RNDU: return round_toward_infinity;
-            case GMP_RNDD: return round_toward_neg_infinity;
-            default: return round_indeterminate;
-            }
-        }
-
-        inline static int digits()                        {    return int(mpfr::mpreal::get_default_prec());    }
-        inline static int digits(const mpfr::mpreal& x)   {    return x.getPrecision();                         }
-
-        inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return mpfr::bits2digits(precision);
-        }
-
-        inline static int digits10(const mpfr::mpreal& x)
-        {
-            return mpfr::bits2digits(x.getPrecision());
-        }
-
-        inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return digits10(precision);
-        }
-#else
-        // Digits and round_style are NOT constants when it comes to mpreal.
-        // If possible, please use functions digits() and round_style() defined above.
-        //
-        // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
-        // Change them accordingly to your application.
-        //
-        // For example, if you use 256 bits of precision uniformly in your program, then:
-        // digits       = 256
-        // digits10     = 77
-        // max_digits10 = 78
-        //
-        // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
-        static const std::float_round_style round_style = round_to_nearest;
-        static const int digits       = 53;
-        static const int digits10     = 15;
-        static const int max_digits10 = 16;
-#endif
-    };
-
-}
-
-#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 685e7ea45..10beb0714 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -1,3 +1,4 @@
+#include <mpreal.h>  // Must be included before main.h.
 #include "main.h"
 #include <Eigen/MPRealSupport>
 #include <Eigen/LU>
@@ -7,7 +8,7 @@
 using namespace mpfr;
 using namespace Eigen;
 
-void test_mpreal_support()
+EIGEN_DECLARE_TEST(mpreal_support)
 {
   // set precision to 256 bits (double has only 53 bits)
   mpreal::set_default_prec(256);
diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp
index 706a816f7..1c4438134 100644
--- a/unsupported/test/openglsupport.cpp
+++ b/unsupported/test/openglsupport.cpp
@@ -9,15 +9,24 @@
 
 #include <main.h>
 #include <iostream>
+#include <string>
+
+#if defined(__APPLE_CC__)
+  // Prevent deprecation warnings caused by GLEW on MacOS.
+  #define GL_SILENCE_DEPRECATION 1
+#endif
 #include <GL/glew.h>
 #include <Eigen/OpenGLSupport>
-#include <GL/glut.h>
-using namespace Eigen;
-
-
+#if defined(__APPLE_CC__)
+  #include <GLUT/glut.h>
+#else
+  #include <GL/freeglut.h>
+#endif
 
+using namespace Eigen;
 
 #define VERIFY_MATRIX(CODE,REF) { \
+    glMatrixMode(GL_MODELVIEW); \
     glLoadIdentity(); \
     CODE; \
     Matrix<float,4,4,ColMajor> m; m.setZero(); \
@@ -40,7 +49,7 @@ using namespace Eigen;
     } \
     VERIFY_IS_APPROX(value, data); \
   }
-  
+
 #define VERIFY_UNIFORMi(NAME,TYPE) { \
     TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \
     TYPE data; \
@@ -53,175 +62,324 @@ using namespace Eigen;
     } \
     VERIFY_IS_APPROX(value, data); \
   }
-  
-void printInfoLog(GLuint objectID)
+
+void printProgramInfoLog(GLuint objectID)
 {
     int infologLength, charsWritten;
     GLchar *infoLog;
-    glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &infologLength);
+    glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
     if(infologLength > 0)
     {
         infoLog = new GLchar[infologLength];
         glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog);
-        if (charsWritten>0)
+        if (charsWritten > 0)
+          std::cerr << "Program info : \n" << infoLog << std::endl;
+        delete[] infoLog;
+    }
+}
+
+void printShaderInfoLog(GLuint objectID)
+{
+    int infologLength, charsWritten;
+    GLchar *infoLog;
+    glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
+    if(infologLength > 0)
+    {
+        infoLog = new GLchar[infologLength];
+        glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog);
+        if (charsWritten > 0)
           std::cerr << "Shader info : \n" << infoLog << std::endl;
         delete[] infoLog;
     }
 }
 
-GLint createShader(const char* vtx, const char* frg)
+GLint createProgram(const char* vtx, const char* frg, bool print_errors = true)
 {
   GLint prg_id = glCreateProgram();
   GLint vtx_id = glCreateShader(GL_VERTEX_SHADER);
   GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER);
   GLint ok;
-  
+
   glShaderSource(vtx_id, 1, &vtx, 0);
   glCompileShader(vtx_id);
-  glGetShaderiv(vtx_id,GL_COMPILE_STATUS,&ok);
+  glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "vtx compilation failed\n";
+    if (print_errors)
+    {
+      std::cerr << "vtx compilation failed\n";
+      std::cerr << "Source:\n" << vtx << "\n";
+      printShaderInfoLog(vtx_id);
+    }
+    glDeleteShader(vtx_id);
+    return GL_ZERO;
   }
-  
+
   glShaderSource(frg_id, 1, &frg, 0);
   glCompileShader(frg_id);
-  glGetShaderiv(frg_id,GL_COMPILE_STATUS,&ok);
+  glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "frg compilation failed\n";
+    if (print_errors)
+    {
+      std::cerr << "frg compilation failed.\n";
+      std::cerr << "Source:\n" << frg << "\n";
+      printShaderInfoLog(frg_id);
+    }
+    glDeleteShader(vtx_id);
+    glDeleteShader(frg_id);
+    return GL_ZERO;
   }
-  
+
   glAttachShader(prg_id, vtx_id);
   glAttachShader(prg_id, frg_id);
   glLinkProgram(prg_id);
-  glGetProgramiv(prg_id,GL_LINK_STATUS,&ok);
+
+  // Delete shaders once linked.
+  glDeleteShader(vtx_id);
+  glDeleteShader(frg_id);
+  glGetProgramiv(prg_id, GL_LINK_STATUS, &ok);
   if(!ok)
   {
-    std::cerr << "linking failed\n";
+    if (print_errors)
+    {
+      std::cerr << "linking failed.\n";
+      printProgramInfoLog(prg_id);
+    }
+    glDeleteProgram(prg_id);
+    return GL_ZERO;
   }
-  printInfoLog(prg_id);
-  
+
   glUseProgram(prg_id);
   return prg_id;
 }
 
-void test_openglsupport()
+GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true)
 {
-  int argc = 0;
-  glutInit(&argc, 0);
-  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
-  glutInitWindowPosition (0,0);
-  glutInitWindowSize(10, 10);
+  return createProgram(vtx.c_str(), frg.c_str(), print_errors);
+}
 
-  if(glutCreateWindow("Eigen") <= 0)
+std::string getGlslVersionString(int gl_major_version, int gl_minor_version)
+{
+  switch (gl_major_version)
   {
-    std::cerr << "Error: Unable to create GLUT Window.\n";
-    exit(1);
+    case 2:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 110";
+        case 1:
+          return "#version 120";
+      }
+      break;
+    case 3:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 130";
+        case 1:
+          return "#version 140";
+        case 2:
+          return "#version 150";
+        case 3:
+          return "#version 330";
+      }
+      break;
+    case 4:
+      switch (gl_minor_version)
+      {
+        case 0:
+          return "#version 400";
+        case 1:
+          return "#version 410";
+        case 2:
+          return "#version 420";
+        case 3:
+          return "#version 430";
+        case 4:
+          return "#version 440";
+        case 5:
+          return "#version 450";
+        case 6:
+          return "#version 460";
+      }
+      break;
   }
-  
-  glewExperimental = GL_TRUE;
-  if(glewInit() != GLEW_OK)
-  {
-    std::cerr << "Warning: Failed to initialize GLEW\n";
+  return "";
+}
+
+void find_and_replace(
+  std::string& str,
+  const std::string& find,
+  const std::string& replace)
+{
+  size_t loc = 0;
+  size_t flen = find.length();
+  size_t rlen = replace.length();
+  while ( (loc = str.find(find, loc)) != std::string::npos) {
+    str.replace(loc, flen, replace);
+    loc += rlen;
   }
+}
 
-  Vector3f v3f;
-  Matrix3f rot;
-  glBegin(GL_POINTS);
-  
-  glVertex(v3f);
-  glVertex(2*v3f+v3f);
-  glVertex(rot*v3f);
-  
-  glEnd();
-  
-  // 4x4 matrices
-  Matrix4f mf44; mf44.setRandom();
-  VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
-  VERIFY_MATRIX(glMultMatrix(mf44), mf44);
-  Matrix4d md44; md44.setRandom();
-  VERIFY_MATRIX(glLoadMatrix(md44), md44);
-  VERIFY_MATRIX(glMultMatrix(md44), md44);
-  
-  // Quaternion
-  Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
-  VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
-  
-  Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
-  VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
-  
-  // 3D Transform
-  Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
-  VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
-  
-  Transform<float,3,Affine> af3(acf3);
-  VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
-  VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
-  
-  Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
-  VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
-  
-  Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
-  VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
-  
-  Transform<double,3,Affine> ad3(acd3);
-  VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
-  VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
-  
-  Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
-  VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
-  VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
-  
-  // translations (2D and 3D)
-  {
-    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
-    VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
-    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
-    VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
-    
-    Vector3f vf3; vf3.setRandom();
-    VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
-    Vector3d vd3; vd3.setRandom();
-    VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
-    
-    Translation<float,3> tf3; tf3.vector().setRandom();
-    VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
-    
-    Translation<double,3> td3;  td3.vector().setRandom();
-    VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+// Finds and replaces a set of substrings in a string.
+std::string format(
+  const std::string& str,
+  const std::vector<std::string>& find,
+  const std::vector<std::string>& replace)
+{
+  std::string out = str;
+  for (std::size_t i=0; i<find.size(); ++i) {
+    find_and_replace(out, find[i], replace[i]);
   }
-  
-  // scaling (2D and 3D)
+  return out;
+}
+
+// GLUT display function that runs test.  Must be run within the display loop
+// in order to properly destroy resources.
+void openglsupport_test_loop()
+{
+  // Get context info.
+  const GLubyte* gl_version_string = glGetString(GL_VERSION);
+  std::cerr << "GL version: " << gl_version_string << std::endl;
+  std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;
+  // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+.
+  // Version string guaranteed to be <major>.<minor><vender extension>.
+  GLint gl_major_version = gl_version_string[0] - '0';
+  GLint gl_minor_version = gl_version_string[2] - '0';
+  bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2);
+
+  // Fixed-function pipeline removed in OpenGL 3.2.
+  if (legacy_gl)
   {
-    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
-    VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
-    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
-    VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
-    
-    Vector3f vf3; vf3.setRandom();
-    VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
-    Vector3d vd3; vd3.setRandom();
-    VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
-    
-    UniformScaling<float> usf(internal::random<float>());
-    VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
-    
-    UniformScaling<double> usd(internal::random<double>());
-    VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+    // Draw a basic triangle.
+    Vector3f v3f;
+    Matrix3f rot;
+    glBegin(GL_POINTS);
+    {
+      glVertex(v3f);
+      glVertex(2*v3f+v3f);
+      glVertex(rot*v3f);
+    }
+    glEnd();
+
+    // 4x4 matrices
+    Matrix4f mf44; mf44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
+    VERIFY_MATRIX(glMultMatrix(mf44), mf44);
+    Matrix4d md44; md44.setRandom();
+    VERIFY_MATRIX(glLoadMatrix(md44), md44);
+    VERIFY_MATRIX(glMultMatrix(md44), md44);
+
+    // Quaternion
+    Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
+    VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
+
+    Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
+    VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
+
+    // 3D Transform
+    Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
+
+    Transform<float,3,Affine> af3(acf3);
+    VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
+    VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
+
+    Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
+
+    Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
+
+    Transform<double,3,Affine> ad3(acd3);
+    VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
+    VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
+
+    Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
+    VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
+    VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
+
+    // translations (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
+      VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
+      VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
+
+      Translation<float,3> tf3; tf3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
+
+      Translation<double,3> td3;  td3.vector().setRandom();
+      VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+    }
+
+    // scaling (2D and 3D)
+    {
+      Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
+      VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
+      Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
+      VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
+
+      Vector3f vf3; vf3.setRandom();
+      VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
+      Vector3d vd3; vd3.setRandom();
+      VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
+
+      UniformScaling<float> usf(internal::random<float>());
+      VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
+
+      UniformScaling<double> usd(internal::random<double>());
+      VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+    }
+  } else {
+    std::cerr << "Warning: fixed-function pipeline was not tested.\n";
+  }
+
+  // Dynamic shader substitution variables.
+  // Modern shaders require a version string, and newer runtimes fail to
+  // compile old GLSL versions. Thus, we dynamically set the GLSL version
+  // string based on runtime. Also, pre OpenGL 3.0, the output gl_FragColor was
+  // built-in. This was deprecated in OpenGL 3.0, requiring us to explicitly
+  // define the output variable.
+  std::vector<std::string> glsl_vars;
+  glsl_vars.push_back("${GLSL_VERSION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}");
+  glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}");
+
+  std::vector<std::string> glsl_vals;
+  glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version));
+  if (gl_major_version >= 3) {
+    glsl_vals.push_back("out vec4 fragColor;");
+    glsl_vals.push_back("fragColor");
+  } else {
+    glsl_vals.push_back("");
+    glsl_vals.push_back("gl_FragColor");
   }
-  
+
   // uniform
   {
-    const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n";
-    
-    if(GLEW_VERSION_2_0)
+    // vertex shader.
+    std::string vtx = format(
+      "${GLSL_VERSION}\n"
+      "void main(void) {\n"
+      "  gl_Position = vec4(0,0,0,1);\n"
+      "}\n",
+      glsl_vars, glsl_vals);
+
+#ifdef GL_VERSION_2_0
+    if(GLEW_VERSION_2_0 && GL_VERSION_2_0)
     {
-      #ifdef GL_VERSION_2_0
-      const char* frg = ""
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform vec2 v2f;\n"
         "uniform vec3 v3f;\n"
         "uniform vec4 v4f;\n"
@@ -231,107 +389,212 @@ void test_openglsupport()
         "uniform mat2 m2f;\n"
         "uniform mat3 m3f;\n"
         "uniform mat4 m4f;\n"
-        "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
-      VERIFY_UNIFORM(fv,v2f, Vector2f);
-      VERIFY_UNIFORM(fv,v3f, Vector3f);
-      VERIFY_UNIFORM(fv,v4f, Vector4f);
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) { \n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(fv, v2f, Vector2f);
+      VERIFY_UNIFORM(fv, v3f, Vector3f);
+      VERIFY_UNIFORM(fv, v4f, Vector4f);
       VERIFY_UNIFORMi(v2i, Vector2i);
       VERIFY_UNIFORMi(v3i, Vector3i);
       VERIFY_UNIFORMi(v4i, Vector4i);
-      VERIFY_UNIFORM(fv,m2f, Matrix2f);
-      VERIFY_UNIFORM(fv,m3f, Matrix3f);
-      VERIFY_UNIFORM(fv,m4f, Matrix4f);
-      #endif
+      VERIFY_UNIFORM(fv, m2f, Matrix2f);
+      VERIFY_UNIFORM(fv, m3f, Matrix3f);
+      VERIFY_UNIFORM(fv, m4f, Matrix4f);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 2.0 was not tested\n";
-    
-    if(GLEW_VERSION_2_1)
+#endif
+      std::cerr << "Warning: opengl 2.0 was not tested.\n";
+
+#ifdef GL_VERSION_2_1
+    if(GLEW_VERSION_2_1 && GL_VERSION_2_1 &&
+        (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1)))
     {
-      #ifdef GL_VERSION_2_1
-      const char* frg = "#version 120\n"
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform mat2x3 m23f;\n"
         "uniform mat3x2 m32f;\n"
         "uniform mat2x4 m24f;\n"
         "uniform mat4x2 m42f;\n"
         "uniform mat3x4 m34f;\n"
         "uniform mat4x3 m43f;\n"
-        "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
       typedef Matrix<float,2,3> Matrix23f;
       typedef Matrix<float,3,2> Matrix32f;
       typedef Matrix<float,2,4> Matrix24f;
       typedef Matrix<float,4,2> Matrix42f;
       typedef Matrix<float,3,4> Matrix34f;
       typedef Matrix<float,4,3> Matrix43f;
-      
-      VERIFY_UNIFORM(fv,m23f, Matrix23f);
-      VERIFY_UNIFORM(fv,m32f, Matrix32f);
-      VERIFY_UNIFORM(fv,m24f, Matrix24f);
-      VERIFY_UNIFORM(fv,m42f, Matrix42f);
-      VERIFY_UNIFORM(fv,m34f, Matrix34f);
-      VERIFY_UNIFORM(fv,m43f, Matrix43f);
-      #endif
+
+      VERIFY_UNIFORM(fv, m23f, Matrix23f);
+      VERIFY_UNIFORM(fv, m32f, Matrix32f);
+      VERIFY_UNIFORM(fv, m24f, Matrix24f);
+      VERIFY_UNIFORM(fv, m42f, Matrix42f);
+      VERIFY_UNIFORM(fv, m34f, Matrix34f);
+      VERIFY_UNIFORM(fv, m43f, Matrix43f);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 2.1 was not tested\n";
-    
-    if(GLEW_VERSION_3_0)
+#endif
+      std::cerr << "Warning: opengl 2.1 was not tested.\n";
+
+#ifdef GL_VERSION_3_0
+    if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3)
     {
-      #ifdef GL_VERSION_3_0
-      const char* frg = "#version 150\n"
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
         "uniform uvec2 v2ui;\n"
         "uniform uvec3 v3ui;\n"
         "uniform uvec4 v4ui;\n"
-        "out vec4 data;\n"
-        "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n"
+        "}\n",
+        glsl_vars, glsl_vals);
+
+      GLint prg_id = createProgram(vtx, frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
       typedef Matrix<unsigned int,2,1> Vector2ui;
       typedef Matrix<unsigned int,3,1> Vector3ui;
       typedef Matrix<unsigned int,4,1> Vector4ui;
-      
+
       VERIFY_UNIFORMi(v2ui, Vector2ui);
       VERIFY_UNIFORMi(v3ui, Vector3ui);
       VERIFY_UNIFORMi(v4ui, Vector4ui);
-      #endif
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: opengl 3.0 was not tested\n";
-    
-    #ifdef GLEW_ARB_gpu_shader_fp64
+#endif
+      std::cerr << "Warning: opengl 3.0 was not tested.\n";
+
+    // dvecn supported if >= 4.1 or ARB_vertex_attrib_64bit
+    bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1);
+    bool has_fp64_extension = false;
+#ifdef GLEW_ARB_gpu_shader_fp64
     if(GLEW_ARB_gpu_shader_fp64)
     {
-      #ifdef GL_ARB_gpu_shader_fp64
-      const char* frg = "#version 150\n"
+      // Check that extension can actually be compiled.
+      if (has_fp64_extension)
+      {
+        std::string frg = format(
+          "${GLSL_VERSION}\n"
+          "#extension GL_ARB_gpu_shader_fp64 : enable\n"
+          "uniform dvec2 dv2;\n"
+          "${FRAG_OUTPUT_DECLARATION}\n"
+          "void main(void) {\n"
+          "  ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n"
+          "}\n",
+          glsl_vars, glsl_vals);
+        GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false);
+        if (prg_id)
+        {
+          has_fp64_extension = true;
+          glDeleteProgram(prg_id);
+        }
+      }
+    }
+#endif
+
+    if( has_fp64_native || has_fp64_extension )
+    {
+      std::vector<std::string> glsl_vars_with_extension = glsl_vars;
+      glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}");
+      std::vector<std::string> glsl_vals_with_extension = glsl_vals;
+      if (has_fp64_extension)
+      {
+        glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable");
+      }
+      else
+      {
+        glsl_vals_with_extension.push_back("");
+      }
+
+      std::string frg = format(
+        "${GLSL_VERSION}\n"
+        "${GLSL_EXTENSIONS}\n"
         "uniform dvec2 v2d;\n"
         "uniform dvec3 v3d;\n"
         "uniform dvec4 v4d;\n"
-        "out vec4 data;\n"
-        "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n";
-        
-      GLint prg_id = createShader(vtx,frg);
-      
-      typedef Vector2d Vector2d;
-      typedef Vector3d Vector3d;
-      typedef Vector4d Vector4d;
-      
-      VERIFY_UNIFORM(dv,v2d, Vector2d);
-      VERIFY_UNIFORM(dv,v3d, Vector3d);
-      VERIFY_UNIFORM(dv,v4d, Vector4d);
-      #endif
+        "${FRAG_OUTPUT_DECLARATION}\n"
+        "void main(void) {\n"
+        "  ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n"
+        "}\n",
+        glsl_vars_with_extension, glsl_vals_with_extension);
+
+      GLint prg_id = createProgram(vtx,frg);
+      VERIFY(prg_id > 0 && "Failed to create program.");
+      VERIFY_UNIFORM(dv, v2d, Vector2d);
+      VERIFY_UNIFORM(dv, v3d, Vector3d);
+      VERIFY_UNIFORM(dv, v4d, Vector4d);
+      glDeleteProgram(prg_id);
     }
     else
-      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
-    #else
-      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
-    #endif
+      std::cerr << "Warning: dvec (fp64) was not tested.\n";
   }
-  
+
+  // Exit loop - Leaving main loop is supported by freeglut, otherwise we
+  // are forced to exit.
+#ifdef FREEGLUT
+  glutLeaveMainLoop();
+  // Trigger another display loop iteration. Otherwise, it just hangs.
+  glutPostRedisplay();
+#else
+  exit(0);
+#endif
+}
+
+EIGEN_DECLARE_TEST(openglsupport)
+{
+  int argc = 0;
+  glutInit(&argc, 0);
+
+  GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH;
+
+#ifndef EIGEN_LEGACY_OPENGL
+  // Initialize 3.2+ OpenGL context.
+#if defined(__APPLE_CC__)
+  glut_display_mode |= GLUT_3_2_CORE_PROFILE;
+#elif defined(FREEGLUT)
+  glutInitContextVersion(3, 2);
+  glutInitContextFlags(GLUT_FORWARD_COMPATIBLE);
+  glutInitContextProfile(GLUT_CORE_PROFILE);
+#endif
+#endif
+
+  glutInitDisplayMode(glut_display_mode);
+  glutInitWindowPosition(0, 0);
+  glutInitWindowSize(10, 10);
+
+  int window = glutCreateWindow("Eigen");
+  if(window <= 0)
+  {
+    std::cerr << "Error: Unable to create GLUT Window.\n";
+    exit(1);
+  }
+
+  glewExperimental = GL_TRUE;
+  if(glewInit() != GLEW_OK)
+  {
+    std::cerr << "Warning: Failed to initialize GLEW.\n";
+    exit(1);
+  }
+
+  // Run test in display, otherwise GLUT fails to clean up and leads to memory
+  // access errors on exit.
+  glutDisplayFunc(openglsupport_test_loop);
+  glutMainLoop();
+  glutDestroyWindow(window);
 }
diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp
index 0c87478dd..4ff9bda5a 100644
--- a/unsupported/test/polynomialsolver.cpp
+++ b/unsupported/test/polynomialsolver.cpp
@@ -26,15 +26,25 @@ struct increment_if_fixed_size
 }
 }
 
+template<typename PolynomialType>
+PolynomialType polyder(const PolynomialType& p)
+{
+  typedef typename PolynomialType::Scalar Scalar;
+  PolynomialType res(p.size());
+  for(Index i=1; i<p.size(); ++i)
+    res[i-1] = p[i]*Scalar(i);
+  res[p.size()-1] = 0.;
+  return res;
+}
 
 template<int Deg, typename POLYNOMIAL, typename SOLVER>
 bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
 {
-  typedef typename POLYNOMIAL::Index Index;
   typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
 
   typedef typename SOLVER::RootsType    RootsType;
-  typedef Matrix<Scalar,Deg,1>          EvalRootsType;
+  typedef Matrix<RealScalar,Deg,1>      EvalRootsType;
 
   const Index deg = pols.size()-1;
 
@@ -44,10 +54,17 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
   psolve.compute( pols );
   const RootsType& roots( psolve.roots() );
   EvalRootsType evr( deg );
+  POLYNOMIAL pols_der = polyder(pols);
+  EvalRootsType der( deg );
   for( int i=0; i<roots.size(); ++i ){
-    evr[i] = std::abs( poly_eval( pols, roots[i] ) ); }
+    evr[i] = std::abs( poly_eval( pols, roots[i] ) );
+    der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) ));
+  }
 
-  bool evalToZero = evr.isZero( test_precision<Scalar>() );
+  // we need to divide by the magnitude of the derivative because
+  // with a high derivative is very small error in the value of the root
+  // yiels a very large error in the polynomial evaluation.
+  bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() );
   if( !evalToZero )
   {
     cerr << "WRONG root: " << endl;
@@ -57,7 +74,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
     cerr << endl;
   }
 
-  std::vector<Scalar> rootModuli( roots.size() );
+  std::vector<RealScalar> rootModuli( roots.size() );
   Map< EvalRootsType > aux( &rootModuli[0], roots.size() );
   aux = roots.array().abs();
   std::sort( rootModuli.begin(), rootModuli.end() );
@@ -83,7 +100,7 @@ void evalSolver( const POLYNOMIAL& pols )
 {
   typedef typename POLYNOMIAL::Scalar Scalar;
 
-  typedef PolynomialSolver<Scalar, Deg >              PolynomialSolverType;
+  typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
 
   PolynomialSolverType psolve;
   aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve );
@@ -97,6 +114,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
 {
   using std::sqrt;
   typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
 
   typedef PolynomialSolver<Scalar, Deg >              PolynomialSolverType;
 
@@ -107,15 +125,12 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
     // 1) the roots found are correct
     // 2) the roots have distinct moduli
 
-    typedef typename POLYNOMIAL::Scalar                 Scalar;
-    typedef typename REAL_ROOTS::Scalar                 Real;
-
     //Test realRoots
-    std::vector< Real > calc_realRoots;
-    psolve.realRoots( calc_realRoots );
-    VERIFY( calc_realRoots.size() == (size_t)real_roots.size() );
+    std::vector< RealScalar > calc_realRoots;
+    psolve.realRoots( calc_realRoots,  test_precision<RealScalar>());
+    VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() );
 
-    const Scalar psPrec = sqrt( test_precision<Scalar>() );
+    const RealScalar psPrec = sqrt( test_precision<RealScalar>() );
 
     for( size_t i=0; i<calc_realRoots.size(); ++i )
     {
@@ -138,7 +153,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
 
     bool hasRealRoot;
     //Test absGreatestRealRoot
-    Real r = psolve.absGreatestRealRoot( hasRealRoot );
+    RealScalar r = psolve.absGreatestRealRoot( hasRealRoot );
     VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
     if( hasRealRoot ){
       VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) );  }
@@ -167,9 +182,11 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
 template<typename _Scalar, int _Deg>
 void polynomialsolver(int deg)
 {
-  typedef internal::increment_if_fixed_size<_Deg>            Dim;
+  typedef typename NumTraits<_Scalar>::Real RealScalar;
+  typedef internal::increment_if_fixed_size<_Deg>     Dim;
   typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
   typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+  typedef Matrix<RealScalar,_Deg,1>                   RealRootsType;
 
   cout << "Standard cases" << endl;
   PolynomialType pols = PolynomialType::Random(deg+1);
@@ -182,19 +199,15 @@ void polynomialsolver(int deg)
   evalSolver<_Deg,PolynomialType>( pols );
 
   cout << "Test sugar" << endl;
-  EvalRootsType realRoots = EvalRootsType::Random(deg);
+  RealRootsType realRoots = RealRootsType::Random(deg);
   roots_to_monicPolynomial( realRoots, pols );
   evalSolverSugarFunction<_Deg>(
       pols,
-      realRoots.template cast <
-                    std::complex<
-                         typename NumTraits<_Scalar>::Real
-                         >
-                    >(),
+      realRoots.template cast <std::complex<RealScalar> >().eval(),
       realRoots );
 }
 
-void test_polynomialsolver()
+EIGEN_DECLARE_TEST(polynomialsolver)
 {
   for(int i = 0; i < g_repeat; i++)
   {
@@ -214,5 +227,6 @@ void test_polynomialsolver()
             internal::random<int>(9,13)
             )) );
     CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );
+    CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) );
   }
 }
diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp
index 5fc968402..8ff451996 100644
--- a/unsupported/test/polynomialutils.cpp
+++ b/unsupported/test/polynomialutils.cpp
@@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar()
           internal::random<int>(18,26) )) );
 }
 
-void test_polynomialutils()
+EIGEN_DECLARE_TEST(polynomialutils)
 {
   for(int i = 0; i < g_repeat; i++)
   {
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index a010ceb93..602c2cb84 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -8,10 +8,45 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-// import basic and product tests for deprectaed DynamicSparseMatrix
+// import basic and product tests for deprecated DynamicSparseMatrix
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+
+#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1
+#endif
+
 #define EIGEN_NO_DEPRECATED_WARNING
-#include "sparse_basic.cpp"
+// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix)
+// has an extra copy-assignment.
+#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
 #include "sparse_product.cpp"
+
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+#include "sparse_basic.cpp"
+#endif
+
+#if EIGEN_HAS_CXX11
+
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#include <unordered_map>
+#define EIGEN_UNORDERED_MAP_SUPPORT
+
+#endif
+
+
 #include <Eigen/SparseExtra>
 
 template<typename SetterType,typename DenseType, typename Scalar, int Options>
@@ -104,10 +139,8 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
     #ifdef EIGEN_UNORDERED_MAP_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif
-    #ifdef _DENSE_HASH_MAP_H_
+    #ifdef EIGEN_GOOGLEHASH_SUPPORT
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
-    #endif
-    #ifdef _SPARSE_HASH_MAP_H_
     VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
     #endif
 
@@ -129,7 +162,32 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
 
 }
 
-void test_sparse_extra()
+
+template<typename SparseMatrixType>
+void check_marketio()
+{
+  typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix;
+  Index rows = internal::random<Index>(1,100);
+  Index cols = internal::random<Index>(1,100);
+  SparseMatrixType m1, m2;
+  m1 = DenseMatrix::Random(rows, cols).sparseView();
+  saveMarket(m1, "sparse_extra.mtx");
+  loadMarket(m2, "sparse_extra.mtx");
+  VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
+}
+
+template<typename VectorType>
+void check_marketio_vector()
+{
+  Index size = internal::random<Index>(1,100);
+  VectorType v1, v2;
+  v1 = VectorType::Random(size);
+  saveMarketVector(v1, "vector_extra.mtx");
+  loadMarketVector(v2, "vector_extra.mtx");
+  VERIFY_IS_EQUAL(v1,v2);
+}
+
+EIGEN_DECLARE_TEST(sparse_extra)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = Eigen::internal::random<int>(1,50);
@@ -143,5 +201,26 @@ void test_sparse_extra()
 
     CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );
     CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
+
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
+    CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
+
+
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) );
+    CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) );
+
+    TEST_SET_BUT_UNUSED_VARIABLE(s);
   }
 }
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 057fb3e92..589bb76e1 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@@ -7,9 +7,21 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <limits.h>
 #include "main.h"
 #include "../Eigen/SpecialFunctions"
 
+// Hack to allow "implicit" conversions from double to Scalar via comma-initialization.
+template<typename Derived>
+Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) {
+  return (dense << static_cast<typename Derived::Scalar>(v));
+}
+
+template<typename XprType>
+Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) {
+  return (ci, static_cast<typename XprType::Scalar>(v));
+}
+
 template<typename X, typename Y>
 void verify_component_wise(const X& x, const Y& y)
 {
@@ -64,8 +76,8 @@ template<typename ArrayType> void array_special_functions()
       //   igamma(a, x) = gamma(a, x) / Gamma(a)
       // where Gamma and gamma are considered the standard unnormalized
       // upper and lower incomplete gamma functions, respectively.
-      ArrayType a = m1.abs() + 2;
-      ArrayType x = m2.abs() + 2;
+      ArrayType a = m1.abs() + Scalar(2);
+      ArrayType x = m2.abs() + Scalar(2);
       ArrayType zero = ArrayType::Zero(rows, cols);
       ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
       ArrayType a_m1 = a - one;
@@ -74,6 +86,7 @@ template<typename ArrayType> void array_special_functions()
       ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
       ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
 
+
       // Gamma(a, 0) == Gamma(a)
       VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
 
@@ -81,10 +94,23 @@ template<typename ArrayType> void array_special_functions()
       VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
 
       // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+      VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp());
 
       // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
-      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+      VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp());
+    }
+    {
+      // Verify for large a and x that values are between 0 and 1.
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+      int max_exponent = std::numeric_limits<Scalar>::max_exponent10;
+      ArrayType a = m1.abs() *  Scalar(pow(10., max_exponent - 1));
+      ArrayType x = m2.abs() *  Scalar(pow(10., max_exponent - 1));
+      for (int i = 0; i < a.size(); ++i) {
+        Scalar igam = numext::igamma(a(i), x(i));
+        VERIFY(0 <= igam);
+        VERIFY(igam <= 1);
+      }
     }
 
     {
@@ -93,27 +119,37 @@ template<typename ArrayType> void array_special_functions()
       Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
 
       // location i*6+j corresponds to a_s[i], x_s[j].
-      Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
-                              {0.0, 0.6321205588285578, 0.7768698398515702,
-                              0.9816843611112658, 9.999500016666262e-05, 1.0},
-                              {0.0, 0.4275932955291202, 0.608374823728911,
-                              0.9539882943107686, 7.522076445089201e-07, 1.0},
-                              {0.0, 0.01898815687615381, 0.06564245437845008,
-                              0.5665298796332909, 4.166333347221828e-18, 1.0},
-                              {0.0, 0.9999780593618628, 0.9999899967080838,
-                              0.9999996219837988, 0.9991370418689945, 1.0},
-                              {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
-      Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
-                              {1.0, 0.36787944117144233, 0.22313016014842982,
-                                0.018315638888734182, 0.9999000049998333, 0.0},
-                              {1.0, 0.5724067044708798, 0.3916251762710878,
-                                0.04601170568923136, 0.9999992477923555, 0.0},
-                              {1.0, 0.9810118431238462, 0.9343575456215499,
-                                0.4334701203667089, 1.0, 0.0},
-                              {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
-                                3.7801620118431334e-07, 0.0008629581310054535,
-                                0.0},
-                              {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+      Scalar igamma_s[][6] = {
+          {Scalar(0.0), nan, nan, nan, nan, nan},
+          {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702),
+           Scalar(0.9816843611112658), Scalar(9.999500016666262e-05),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911),
+           Scalar(0.9539882943107686), Scalar(7.522076445089201e-07),
+           Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.01898815687615381),
+           Scalar(0.06564245437845008), Scalar(0.5665298796332909),
+           Scalar(4.166333347221828e-18), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838),
+           Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)},
+          {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0),
+           Scalar(0.5042041932513908)}};
+      Scalar igammac_s[][6] = {
+          {nan, nan, nan, nan, nan, nan},
+          {Scalar(1.0), Scalar(0.36787944117144233),
+           Scalar(0.22313016014842982), Scalar(0.018315638888734182),
+           Scalar(0.9999000049998333), Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878),
+           Scalar(0.04601170568923136), Scalar(0.9999992477923555),
+           Scalar(0.0)},
+          {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499),
+           Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)},
+          {Scalar(1.0), Scalar(2.1940638138146658e-05),
+           Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07),
+           Scalar(0.0008629581310054535), Scalar(0.0)},
+          {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0),
+           Scalar(0.49579580674813944)}};
+
       for (int i = 0; i < 6; ++i) {
         for (int j = 0; j < 6; ++j) {
           if ((std::isnan)(igamma_s[i][j])) {
@@ -133,12 +169,32 @@ template<typename ArrayType> void array_special_functions()
   }
 #endif  // EIGEN_HAS_C99_MATH
 
+  // Check the ndtri function against scipy.special.ndtri
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
+    ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+    CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
+
+    // ndtri(normal_cdf(x)) ~= x
+    CALL_SUBTEST(
+        ArrayType m1 = ArrayType::Random(32);
+        using std::sqrt;
+
+        ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf();
+        cdf_val = (cdf_val + Scalar(1)) / Scalar(2);
+        verify_component_wise(cdf_val.ndtri(), m1););
+
+  }
+
   // Check the zeta function against scipy.special.zeta
   {
-    ArrayType x(7), q(7), res(7), ref(7);
-    x << 1.5,   4, 10.5, 10000.5,    3, 1,        0.9;
-    q << 2,   1.5,    3,  1.0001, -2.5, 1.2345, 1.2345;
-    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+    ArrayType x(10), q(10), res(10), ref(10);
+    x << 1.5,   4, 10.5, 10000.5,    3,      1,    0.9,  2,  3,  4;
+    q <<   2, 1.5,    3,  1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
     CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
     CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
@@ -146,22 +202,21 @@ template<typename ArrayType> void array_special_functions()
 
   // digamma
   {
-    ArrayType x(7), res(7), ref(7);
-    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
-    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+    ArrayType x(9), res(9), ref(9);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
 
     CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
     CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
   }
 
-
 #if EIGEN_HAS_C99_MATH
   {
-    ArrayType n(11), x(11), res(11), ref(11);
-    n << 1, 1,    1, 1.5,   17,   31,   28,    8, 42, 147, 170;
-    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
-    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+    ArrayType n(16), x(16), res(16), ref(16);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8,   42,  147, 170, -1,  0,  1,  2,  3;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1,  64, -1, -2, -3, -4, -5;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf;
     CALL_SUBTEST( verify_component_wise(ref, ref); );
 
     if(sizeof(RealScalar)>=8) {  // double
@@ -288,8 +343,8 @@ template<typename ArrayType> void array_special_functions()
     ArrayType m3 = ArrayType::Random(32);
     ArrayType one = ArrayType::Constant(32, Scalar(1.0));
     const Scalar eps = std::numeric_limits<Scalar>::epsilon();
-    ArrayType a = (m1 * 4.0).exp();
-    ArrayType b = (m2 * 4.0).exp();
+    ArrayType a = (m1 * Scalar(4)).exp();
+    ArrayType b = (m2 * Scalar(4)).exp();
     ArrayType x = m3.abs();
 
     // betainc(a, 1, x) == x**a
@@ -335,11 +390,108 @@ template<typename ArrayType> void array_special_functions()
         ArrayType test = betainc(a, b + one, x) + eps;
         verify_component_wise(test, expected););
   }
-#endif
+#endif  // EIGEN_HAS_C99_MATH
+
+    /* Code to generate the data for the following two test cases.
+    N = 5
+    np.random.seed(3)
+
+    a = np.logspace(-2, 3, 6)
+    a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N]))
+    x = np.random.gamma(a, 1.0)
+    x = np.maximum(x, np.finfo(np.float32).tiny)
+
+    def igamma(a, x):
+      return mpmath.gammainc(a, 0, x, regularized=True)
+
+    def igamma_der_a(a, x):
+      res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a)
+      return np.float64(res)
+
+    def gamma_sample_der_alpha(a, x):
+      igamma_x = igamma(a, x)
+      def igammainv_of_igamma(a_prime):
+        return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) -
+            igamma_x, x, solver='newton')
+      return np.float64(mpmath.diff(igammainv_of_igamma, a))
+
+    v_igamma_der_a = np.vectorize(igamma_der_a)(a, x)
+    v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
+  */
+
+#if EIGEN_HAS_C99_MATH
+  // Test igamma_der_a
+  {
+    ArrayType a(30);
+    ArrayType x(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
+        1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514,
+        -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596,
+        -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323,
+        -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522,
+        -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294,
+        -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921,
+        -0.00794899153653, -0.00777303219211, -0.00796085782042,
+        -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+    CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v););
+  }
+
+  // Test gamma_sample_der_alpha
+  {
+    ArrayType alpha(30);
+    ArrayType sample(30);
+    ArrayType res(30);
+    ArrayType v(30);
+
+    alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+        1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+        100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+    sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+        1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+        0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+        0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+        0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+        10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+        86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+        969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+    v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+        1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+        0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+        1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+        0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+        1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+        0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+        1.00106492525, 0.97734200649, 1.02198794179;
+
+    CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
+                 verify_component_wise(res, v););
+  }
+#endif  // EIGEN_HAS_C99_MATH
 }
 
-void test_special_functions()
+EIGEN_DECLARE_TEST(special_functions)
 {
   CALL_SUBTEST_1(array_special_functions<ArrayXf>());
   CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+  // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above.
+  // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>());
+  // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>());
 }
diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
new file mode 100644
index 000000000..31233f1b0
--- /dev/null
+++ b/unsupported/test/special_packetmath.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <limits>
+#include "packetmath_test_shared.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename Scalar,typename Packet> void packetmath_real()
+{
+  using std::abs;
+  typedef internal::packet_traits<Scalar> PacketTraits;
+  const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+  const int size = PacketSize*4;
+  EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
+  EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
+
+#if EIGEN_HAS_C99_MATH
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
+    h.store(data2, internal::plgamma(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  if (internal::packet_traits<Scalar>::HasErf) {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
+    h.store(data2, internal::perf(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+    test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
+    h.store(data2, internal::perfc(h.load(data1)));
+    VERIFY((numext::isnan)(data2[0]));
+  }
+  {
+    for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1));
+    }
+    CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // For bessel_i*e and bessel_j*, the valid range is negative reals.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+    }
+
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
+    CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+  }
+
+  // Use a smaller data range for the bessel_i* as these can become very large.
+  // Following #1693, we also restrict this range further to avoid inf's due to
+  // differences in pexp and exp.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1);
+
+
+  // y_i, and k_i are valid for x > 0.
+  {
+    const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5);
+    for (int i=0; i<size; ++i)
+    {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+    }
+  }
+
+  // TODO(srvasude): Re-enable this test once properly investigated why the
+  // scalar and vector paths differ.
+  // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e);
+
+  // Following #1693, we restrict the range for exp to avoid zeroing out too
+  // fast.
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0);
+  CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1);
+
+
+  for (int i=0; i<size; ++i) {
+      data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+      data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+                  Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+  }
+
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+#endif
+
+}
+
+namespace Eigen {
+namespace test {
+
+template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger>
+struct runall {
+  static void run() {
+    packetmath_real<Scalar,PacketType>();
+  }
+};
+
+}
+}
+
+EIGEN_DECLARE_TEST(special_packetmath)
+{
+  g_first_pass = true;
+  for(int i = 0; i < g_repeat; i++) {
+
+    CALL_SUBTEST_1( test::runner<float>::run() );
+    CALL_SUBTEST_2( test::runner<double>::run() );
+    CALL_SUBTEST_3( test::runner<Eigen::half>::run() );
+    CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() );
+    g_first_pass = false;
+  }
+}
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index 3be020434..88ec87b97 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d()
   }
 }
 
-void test_splines()
+EIGEN_DECLARE_TEST(splines)
 {
   for (int i = 0; i < g_repeat; ++i)
   {