author    | Yi Kong <yikong@google.com> | 2022-02-25 17:02:53 +0000
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-02-25 17:02:53 +0000
commit    | edb0ad5bb04b48aab7dd0978f0475edd3550de7c (patch)
tree      | fb979fb4cf4f8052c8cc66b1ec9516d91fcd859b /unsupported/test
parent    | 8fd413e275f78a4c240f1442ce5cf77c73a20a55 (diff)
parent    | bc0f5df265caa21a2120c22453655a7fcc941991 (diff)
Merge changes Iee153445,Iee274471 am: 79df15ea88 am: 10f298fc41 am: 7cb5001398 am: bc0f5df265
Original change: https://android-review.googlesource.com/c/platform/external/eigen/+/1999079
Change-Id: Ife39d10c8b23d3eeb174cd52f462f9d20527ad03
Diffstat (limited to 'unsupported/test')
131 files changed, 15434 insertions, 4773 deletions
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
 };
 
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
 {
   for(int i = 0; i < g_repeat; i++) {
 #ifdef EIGEN_TEST_PART_1
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index b5fa1c845..d30fa62bd 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -22,22 +13,27 @@ include_directories(../../test ../../unsupported ../../Eigen
 find_package (Threads)
 
 find_package(GoogleHash)
-if(GOOGLEHASH_FOUND)
+if(GoogleHash_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
   include_directories(${GOOGLEHASH_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "GoogleHash, ")
-else(GOOGLEHASH_FOUND)
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "GoogleHash, ")
-endif(GOOGLEHASH_FOUND)
+endif()
+
 find_package(Adolc)
-if(ADOLC_FOUND)
+if(Adolc_FOUND)
   include_directories(${ADOLC_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
-  ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
-else(ADOLC_FOUND)
+  if(EIGEN_TEST_CXX11)
+    ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+  else()
+    message(STATUS "Adolc found, but tests require C++11 mode")
+  endif()
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
-endif(ADOLC_FOUND)
+endif()
 
 # this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
 # see thread: "non-linear optimization test summary"
@@ -47,9 +43,7 @@ ei_add_test(NumericalDiff)
 ei_add_test(autodiff_scalar)
 ei_add_test(autodiff)
 
-if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
-endif()
 
 ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
@@ -61,13 +55,11 @@ ei_add_test(FFT)
 
 ei_add_test(EulerAngles)
 
-find_package(MPFR 2.3.0)
-find_package(GMP)
-if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
-  include_directories(${MPFR_INCLUDES} ./mpreal)
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
-  set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+  include_directories(${MPREAL_INCLUDES})
+  ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -87,8 +79,8 @@ else()
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
   find_package(GLUT)
   find_package(GLEW)
@@ -108,89 +100,192 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
+ei_add_test(idrs)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
 ei_add_test(special_functions)
-
-# TODO: The following test names are prefixed with the cxx11 string, since historically
-# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
-# when using visual studio. We should make the check more strict to enable the tests for
-# newer versions of MSVC.
-if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-ei_add_test(cxx11_tensor_dimension)
-ei_add_test(cxx11_tensor_map)
-ei_add_test(cxx11_tensor_assign)
-ei_add_test(cxx11_tensor_comparisons)
-ei_add_test(cxx11_tensor_forced_eval)
-ei_add_test(cxx11_tensor_math)
-ei_add_test(cxx11_tensor_const)
-ei_add_test(cxx11_tensor_intdiv)
-ei_add_test(cxx11_tensor_casts)
-ei_add_test(cxx11_tensor_empty)
-ei_add_test(cxx11_tensor_sugar)
-ei_add_test(cxx11_tensor_roundings)
-ei_add_test(cxx11_tensor_layout_swap)
-ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-  # This test requires __uint128_t which is only available on 64bit systems
-  ei_add_test(cxx11_tensor_uint128)
-endif()
-endif()
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
 
 if(EIGEN_TEST_CXX11)
   if(EIGEN_TEST_SYCL)
-    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
-  endif(EIGEN_TEST_SYCL)
-  # It should be safe to always run these tests as there is some fallback code for
-  # older compiler that don't support cxx11.
-  set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_SYCL ON)
+    # Forward CMake options as preprocessor definitions
+    if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+      add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+    endif()
+    if(EIGEN_SYCL_NO_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+      add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+    endif()
+    if(EIGEN_SYCL_REG_M)
+      add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+    endif()
+    if(EIGEN_SYCL_REG_N)
+      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+    endif()
+    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+    endif()
+    if(EIGEN_SYCL_ASYNC_EXECUTION)
+      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SKINNY)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+    endif()
+    if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+      add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+    endif()
+    if(EIGEN_SYCL_DISABLE_RANK1)
+      add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SCALAR)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+    endif()
+    if(EIGEN_SYCL_DISABLE_GEMV)
+      add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+    endif()
+    if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+      add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+    endif()
+
+    if(EIGEN_SYCL_TRISYCL)
+      # triSYCL now requires c++17.
+      set(CMAKE_CXX_STANDARD 17)
+    else()
+      if(MSVC)
+        # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+        # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+        set(CMAKE_CXX_STANDARD 14)
+        list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+      else()
+        set(CMAKE_CXX_STANDARD 11)
+        list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+      endif()
+      # The following flags are not supported by Clang and can cause warnings
+      # if used with -Werror so they are removed here.
+      if(COMPUTECPP_USE_COMPILER_DRIVER)
+        set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+        string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      endif()
+      list(APPEND COMPUTECPP_USER_FLAGS
+          -DEIGEN_NO_ASSERTION_CHECKING=1
+          -no-serial-memop
+          -Xclang
+          -cl-mad-enable)
+    endif()
+
+    ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+    set(EIGEN_SYCL OFF)
+  endif()
 
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 
   ei_add_test(cxx11_meta)
-  ei_add_test(cxx11_tensor_simple)
-#  ei_add_test(cxx11_tensor_symmetry)
-  ei_add_test(cxx11_tensor_index_list)
-  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_maxsizevector)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_assign)
+  ei_add_test(cxx11_tensor_block_access)
+  ei_add_test(cxx11_tensor_block_eval)
+  ei_add_test(cxx11_tensor_block_io)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_casts)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_comparisons)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_const)
   ei_add_test(cxx11_tensor_contraction)
   ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_dimension)
+  ei_add_test(cxx11_tensor_empty)
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fft)
   ei_add_test(cxx11_tensor_fixed_size)
-  ei_add_test(cxx11_tensor_of_const_values)
-  ei_add_test(cxx11_tensor_of_complex)
-  ei_add_test(cxx11_tensor_of_strings)
-  ei_add_test(cxx11_tensor_lvalue)
-  ei_add_test(cxx11_tensor_broadcasting)
-  ei_add_test(cxx11_tensor_chipping)
-  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_forced_eval)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_index_list)
   ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_intdiv)
+  ei_add_test(cxx11_tensor_io)
+  ei_add_test(cxx11_tensor_layout_swap)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_map)
+  ei_add_test(cxx11_tensor_math)
+  ei_add_test(cxx11_tensor_mixed_indices)
   ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_move)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_strings)
   ei_add_test(cxx11_tensor_padding)
   ei_add_test(cxx11_tensor_patch)
-  ei_add_test(cxx11_tensor_image_patch)
-  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_reduction)
-  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_roundings)
+  ei_add_test(cxx11_tensor_scan)
   ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_simple)
   ei_add_test(cxx11_tensor_striding)
-  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_sugar)
+  ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
-  ei_add_test(cxx11_tensor_ref)
-  ei_add_test(cxx11_tensor_random)
-  ei_add_test(cxx11_tensor_generator)
-  ei_add_test(cxx11_tensor_custom_op)
-  ei_add_test(cxx11_tensor_custom_index)
-  ei_add_test(cxx11_tensor_fft)
-  ei_add_test(cxx11_tensor_ifft)
-  ei_add_test(cxx11_tensor_scan)
+  ei_add_test(cxx11_tensor_trace)
+  ei_add_test(cxx11_tensor_volume_patch)
+#  ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
 endif()
 
@@ -213,7 +308,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+    endforeach()
   endif()
 
   set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
@@ -221,37 +320,98 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
-  else()
-    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
-    set(EIGEN_CUDA_CXX11_FLAG "")
-  endif()
-
-  set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  set(NVCC_ARCH_FLAGS)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endforeach()
+  set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_complex_cuda)
-  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
-  ei_add_test(cxx11_tensor_reduction_cuda)
-  ei_add_test(cxx11_tensor_argmax_cuda)
-  ei_add_test(cxx11_tensor_cast_float16_cuda)
-  ei_add_test(cxx11_tensor_scan_cuda)
+  ei_add_test(cxx11_tensor_complex_gpu)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+  ei_add_test(cxx11_tensor_reduction_gpu)
+  ei_add_test(cxx11_tensor_argmax_gpu)
+  ei_add_test(cxx11_tensor_cast_float16_gpu)
+  ei_add_test(cxx11_tensor_scan_gpu)
+
+  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+    endif()
+  endforeach()
 
   # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
     ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_cuda)
-    ei_add_test(cxx11_tensor_contract_cuda)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
+    ei_add_test(cxx11_tensor_gpu)
+    ei_add_test(cxx11_tensor_contract_gpu)
+    ei_add_test(cxx11_tensor_of_float16_gpu)
   endif()
 
   # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_cuda)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_gpu)
   endif()
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+        include_directories(${CMAKE_CURRENT_BINARY_DIR})
+        include_directories(${HIP_PATH}/include)
+
+        set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+        #
+        # complex datatype is not yet supported by HIP
+        # so leaving out those tests for now
+        #
+        # ei_add_test(cxx11_tensor_complex_gpu)
+        # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+        #
+        ei_add_test(cxx11_tensor_reduction_gpu)
+        ei_add_test(cxx11_tensor_argmax_gpu)
+        ei_add_test(cxx11_tensor_cast_float16_gpu)
+        ei_add_test(cxx11_tensor_scan_gpu)
+        ei_add_test(cxx11_tensor_device)
+
+        ei_add_test(cxx11_tensor_gpu)
+        ei_add_test(cxx11_tensor_contract_gpu)
+        ei_add_test(cxx11_tensor_of_float16_gpu)
+        ei_add_test(cxx11_tensor_random_gpu)
+
+        unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+        message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+        message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif()
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index a8cb52864..0955795b6 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -13,146 +13,220 @@ using namespace Eigen;
 
-template<typename EulerSystem, typename Scalar>
-void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
-  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+  return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approxed range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+  do { \
+    VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+    VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+  } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
 {
   typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
   typedef Quaternion<Scalar> QuaternionType;
   typedef AngleAxis<Scalar> AngleAxisType;
-  using std::abs;
-
-  Scalar alphaRangeStart, alphaRangeEnd;
-  Scalar betaRangeStart, betaRangeEnd;
-  Scalar gammaRangeStart, gammaRangeEnd;
 
-  if (positiveRangeAlpha)
-  {
-    alphaRangeStart = Scalar(0);
-    alphaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    alphaRangeStart = -Scalar(EIGEN_PI);
-    alphaRangeEnd = Scalar(EIGEN_PI);
-  }
+  const Scalar ONE = Scalar(1);
+  const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+  const Scalar PI = Scalar(EIGEN_PI);
 
-  if (positiveRangeBeta)
-  {
-    betaRangeStart = Scalar(0);
-    betaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    betaRangeStart = -Scalar(EIGEN_PI);
-    betaRangeEnd = Scalar(EIGEN_PI);
-  }
+  // It's very important calc the acceptable precision depending on the distance from the pole.
+  const Scalar longitudeRadius = std::abs(
+    EulerSystem::IsTaitBryan ?
    std::cos(e.beta()) :
+    std::sin(e.beta())
+    );
+  Scalar precision = test_precision<Scalar>() / longitudeRadius;
 
-  if (positiveRangeGamma)
+  Scalar betaRangeStart, betaRangeEnd;
+  if (EulerSystem::IsTaitBryan)
   {
-    gammaRangeStart = Scalar(0);
-    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+    betaRangeStart = -HALF_PI;
+    betaRangeEnd = HALF_PI;
   }
   else
   {
-    gammaRangeStart = -Scalar(EIGEN_PI);
-    gammaRangeEnd = Scalar(EIGEN_PI);
+    if (!EulerSystem::IsBetaOpposite)
+    {
+      betaRangeStart = 0;
+      betaRangeEnd = PI;
+    }
+    else
+    {
+      betaRangeStart = -PI;
+      betaRangeEnd = 0;
+    }
   }
 
-  const int i = EulerSystem::AlphaAxisAbs - 1;
-  const int j = EulerSystem::BetaAxisAbs - 1;
-  const int k = EulerSystem::GammaAxisAbs - 1;
+  const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+  const Vector3 K_ = EulerAnglesType::GammaAxisVector();
 
-  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
-  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
-  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
-
-  const Vector3 I = EulerAnglesType::AlphaAxisVector();
-  const Vector3 J = EulerAnglesType::BetaAxisVector();
-  const Vector3 K = EulerAnglesType::GammaAxisVector();
-
-  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  // Is approx checks
+  VERIFY(e.isApprox(e));
+  VERIFY_IS_APPROX(e, e);
+  VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+  const Matrix3 m(e);
+  VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+  EulerAnglesType ebis(m);
 
-  Matrix3 m(e);
-  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  // When no roll(acting like polar representation), we have the best precision.
+  // One of those cases is when the Euler angles are on the pole, and because it's singular case,
+  // the computation returns no roll.
+  if (ebis.beta() == 0)
+    precision = test_precision<Scalar>();
 
   // Check that eabis in range
-  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
-  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
-  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+  VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+  const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+  VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+  VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+  /*std::cout << "===================\n" <<
+    "e: " << e << std::endl <<
+    "eabis: " << eabis.transpose() << std::endl <<
+    "m: " << m << std::endl <<
+    "mbis: " << mbis << std::endl <<
+    "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+    "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+  VERIFY(m.isApprox(mbis, precision));
+
+  // Test if ea and eabis are the same
+  // Need to check both singular and non-singular cases
+  // There are two singular cases.
+  // 1. When I==K and sin(ea(1)) == 0
+  // 2. When I!=K and cos(ea(1)) == 0
+
+  // TODO: Make this test work well, and use range saturation function.
+  /*// If I==K, and ea[1]==0, then there no unique solution.
+  // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
+    VERIFY_IS_APPROX(ea, eabis);*/
 
-  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  // Quaternions
+  const QuaternionType q(e);
+  ebis = q;
+  const QuaternionType qbis(ebis);
+  VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+  //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
 
-  // Invert the relevant axes
-  eabis2[0] *= iFactor;
-  eabis2[1] *= jFactor;
-  eabis2[2] *= kFactor;
+  // A suggestion for simple product test when will be supported.
+  /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+  Matrix3 m2(e2);
+  VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_vec<+A,+B,+C>(ea);
+  verify_euler_vec<+A,+B,-C>(ea);
+  verify_euler_vec<+A,-B,+C>(ea);
+  verify_euler_vec<+A,-B,-C>(ea);
 
-  // Saturate the angles to the correct range
-  if (positiveRangeAlpha && (eabis2[0] < 0))
-    eabis2[0] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeBeta && (eabis2[1] < 0))
-    eabis2[1] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeGamma && (eabis2[2] < 0))
-    eabis2[2] += Scalar(2 * EIGEN_PI);
+  verify_euler_vec<-A,+B,+C>(ea);
+  verify_euler_vec<-A,+B,-C>(ea);
+  verify_euler_vec<-A,-B,+C>(ea);
+  verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_all_neg<X,Y,Z>(ea);
+  verify_euler_all_neg<X,Y,X>(ea);
+  verify_euler_all_neg<X,Z,Y>(ea);
+  verify_euler_all_neg<X,Z,X>(ea);
 
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  verify_euler_all_neg<Y,Z,X>(ea);
+  verify_euler_all_neg<Y,Z,Y>(ea);
+  verify_euler_all_neg<Y,X,Z>(ea);
+  verify_euler_all_neg<Y,X,Y>(ea);
 
-  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
-  VERIFY_IS_APPROX(m, mbis);
+  verify_euler_all_neg<Z,X,Y>(ea);
+  verify_euler_all_neg<Z,X,Z>(ea);
+  verify_euler_all_neg<Z,Y,X>(ea);
+  verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  const Scalar PI = Scalar(EIGEN_PI);
 
-  // Tests that are only relevant for no possitive range
-  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
   {
-    /* If I==K, and ea[1]==0, then there no unique solution. */
-    /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */
-    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
-      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
-
-    // approx_or_less_than does not work for 0
-    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+    check_all_var(Vector3(PI/4, singularBeta, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+    check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+    check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+    check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
   }
 
-  // Quaternions
-  QuaternionType q(e);
-  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
-}
-
-template<typename EulerSystem, typename Scalar>
-void verify_euler(const Matrix<Scalar,3,1>& ea)
-{
-  verify_euler_ranged<EulerSystem>(ea, false, false, false);
-  verify_euler_ranged<EulerSystem>(ea, false, false, true);
-  verify_euler_ranged<EulerSystem>(ea, false, true, false);
-  verify_euler_ranged<EulerSystem>(ea, false, true, true);
-  verify_euler_ranged<EulerSystem>(ea, true, false, false);
-  verify_euler_ranged<EulerSystem>(ea, true, false, true);
-  verify_euler_ranged<EulerSystem>(ea, true, true, false);
-  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+  // This one for sanity, it had a problem with near pole cases in float scalar.
+  check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
 }
 
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+template<typename Scalar> void eulerangles_manual()
 {
-  verify_euler<EulerSystemXYZ>(ea);
-  verify_euler<EulerSystemXYX>(ea);
-  verify_euler<EulerSystemXZY>(ea);
-  verify_euler<EulerSystemXZX>(ea);
-
-  verify_euler<EulerSystemYZX>(ea);
-  verify_euler<EulerSystemYZY>(ea);
-  verify_euler<EulerSystemYXZ>(ea);
-  verify_euler<EulerSystemYXY>(ea);
-
-  verify_euler<EulerSystemZXY>(ea);
-  verify_euler<EulerSystemZXZ>(ea);
-  verify_euler<EulerSystemZYX>(ea);
-  verify_euler<EulerSystemZYZ>(ea);
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,Dynamic,1> VectorX;
+  const Vector3 Zero = Vector3::Zero();
+  const Scalar PI = Scalar(EIGEN_PI);
+
+  check_all_var(Zero);
+
+  // singular cases
+  check_singular_cases(PI/2);
+  check_singular_cases(-PI/2);
+
+  check_singular_cases(Scalar(0));
+  check_singular_cases(Scalar(-0));
+
+  check_singular_cases(PI);
+  check_singular_cases(-PI);
+
+  // non-singular cases
+  VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  VectorX beta = VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+  VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  for (int i = 0; i < alpha.size(); ++i) {
+    for (int j = 0; j < beta.size(); ++j) {
+      for (int k = 0; k < gamma.size(); ++k) {
+        check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+      }
+    }
+  }
 }
 
-template<typename Scalar> void eulerangles()
+template<typename Scalar> void eulerangles_rand()
 {
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
@@ -199,10 +273,24 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
 }
 
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
 {
+  // Simple cast test
+  EulerAnglesXYZd onesEd(1, 1, 1);
+  EulerAnglesXYZf onesEf = onesEd.cast<float>();
+  VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+
+  CALL_SUBTEST_1( eulerangles_manual<float>() );
+  CALL_SUBTEST_2( eulerangles_manual<double>() );
+
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( eulerangles<float>() );
-    CALL_SUBTEST_2( eulerangles<double>() );
+    CALL_SUBTEST_3( eulerangles_rand<float>() );
+    CALL_SUBTEST_4( eulerangles_rand<double>() );
   }
+
+  // TODO: Add tests for auto diff
+  // TODO: Add tests for complex numbers
 }
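The reworked EulerAngles test above is essentially a round-trip check: convert angles to a rotation matrix, recover the angles, and compare within a precision that loosens near the beta pole. A standalone sketch of that property, using types and calls that appear in the patch (EulerAnglesXYZd, toRotationMatrix(), angles()); the concrete angle values are made up:

    // Illustrative sketch only, not part of the patch.
    #include <iostream>
    #include <Eigen/Geometry>
    #include <unsupported/Eigen/EulerAngles>

    int main()
    {
      Eigen::EulerAnglesXYZd e(0.3, -0.2, 0.5);   // away from the beta = +-pi/2 pole
      Eigen::Matrix3d m = e.toRotationMatrix();   // angles -> rotation matrix
      Eigen::EulerAnglesXYZd ebis(m);             // matrix -> range-normalized angles
      std::cout << (e.angles() - ebis.angles()).norm() << "\n";  // ~0 here
      return 0;
    }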
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
   VERIFY( (in1-in).norm() < test_precision<float>() );
 }
 
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
 {
   CALL_SUBTEST( test_return_by_value(32) );
   //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..c667b7247 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -15,6 +15,15 @@
 // tolerance for chekcing number of iterations
 #define LM_EVAL_COUNT_TOL 4/3
 
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+    ++g_test_level; \
+    VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+    VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+    --g_test_level; \
+    VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+    VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+  }
+
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
   /*      subroutine fcn for chkder example. */
@@ -180,8 +189,7 @@ void testLmder1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -209,8 +217,7 @@ void testLmder()
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -294,8 +301,7 @@ void testHybrj1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -329,8 +335,7 @@ void testHybrj()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -485,8 +490,7 @@ void testLmstr1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -514,8 +518,7 @@ void testLmstr()
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -565,7 +568,7 @@ void testLmdif1()
   // do the computation
   lmdif_functor functor;
-  DenseIndex nfev;
+  DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
   info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
 
   // check return value
@@ -686,8 +689,7 @@ void testNistChwirut2(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -707,8 +709,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -766,8 +767,7 @@ void testNistMisra1a(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 19);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 19, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -783,8 +783,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 5);
-  VERIFY_IS_EQUAL(lm.njev, 4);
+  LM_CHECK_N_ITERS(lm, 5, 4);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -856,8 +855,7 @@ void testNistHahn1(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -878,8 +876,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -942,8 +939,7 @@ void testNistMisra1d(void)
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 7);
+  LM_CHECK_N_ITERS(lm, 9, 7);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -959,8 +955,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 4);
-  VERIFY_IS_EQUAL(lm.njev, 3);
+  LM_CHECK_N_ITERS(lm, 4, 3);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -1020,8 +1015,7 @@ void testNistLanczos1(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 79);
-  VERIFY_IS_EQUAL(lm.njev, 72);
+  LM_CHECK_N_ITERS(lm, 79, 72);
   // check norm^2
   std::cout.precision(30);
   std::cout << lm.fvec.squaredNorm() << "\n";
@@ -1043,8 +1037,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
@@ -1108,8 +1101,7 @@ void testNistRat42(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1126,8 +1118,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1186,8 +1177,7 @@ void testNistMGH10(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 284 );
-  VERIFY_IS_EQUAL(lm.njev, 249 );
+  LM_CHECK_N_ITERS(lm, 284, 249);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1204,8 +1194,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 126);
-  VERIFY_IS_EQUAL(lm.njev, 116);
+  LM_CHECK_N_ITERS(lm, 126, 116);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1265,8 +1254,7 @@ void testNistBoxBOD(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY(lm.nfev < 31); // 31
-  VERIFY(lm.njev < 25); // 25
+  LM_CHECK_N_ITERS(lm, 31, 25);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1284,9 +1272,8 @@
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 15 );
-  VERIFY_IS_EQUAL(lm.njev, 14 );
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 15, 14);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1356,12 +1343,7 @@ void testNistMGH17(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  ++g_test_level;
-  VERIFY_IS_EQUAL(lm.nfev, 602); // 602
-  VERIFY_IS_EQUAL(lm.njev, 545); // 545
-  --g_test_level;
-  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
-  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
+  LM_CHECK_N_ITERS(lm, 602, 545);
 
   /*
    * Second try
   */
@@ -1373,8 +1355,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
   // check x
@@ -1438,9 +1419,8 @@ void testNistMGH09(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 490 );
-  VERIFY_IS_EQUAL(lm.njev, 376 );
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 490, 376);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1459,8 +1439,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 16);
+  LM_CHECK_N_ITERS(lm, 18, 16);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1525,8 +1504,7 @@ void testNistBennett5(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 758);
-  VERIFY_IS_EQUAL(lm.njev, 744);
+  LM_CHECK_N_ITERS(lm, 758, 744);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1543,8 +1521,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 203);
-  VERIFY_IS_EQUAL(lm.njev, 192);
+  LM_CHECK_N_ITERS(lm, 203, 192);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1613,8 +1590,7 @@ void testNistThurber(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 39);
-  VERIFY_IS_EQUAL(lm.njev, 36);
+  LM_CHECK_N_ITERS(lm, 39,36);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1638,8 +1614,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 29);
-  VERIFY_IS_EQUAL(lm.njev, 28);
+  LM_CHECK_N_ITERS(lm, 29, 28);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1705,8 +1680,7 @@ void testNistRat43(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 27);
-  VERIFY_IS_EQUAL(lm.njev, 20);
+  LM_CHECK_N_ITERS(lm, 27, 20);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1727,8 +1701,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1790,8 +1763,7 @@ void testNistEckerle4(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1808,8 +1780,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1818,7 +1789,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
 {
   // Tests using the examples provided by (c)minpack
   CALL_SUBTEST/*_1*/(testChkder());
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..6d836413b 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -24,7 +24,7 @@ struct Functor
   int m_inputs, m_values;
 
   Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -107,7 +107,7 @@ void test_central()
   VERIFY_IS_APPROX(jac, actual_jac);
 }
 
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
 {
   CALL_SUBTEST(test_forward());
   CALL_SUBTEST(test_central());
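The LM_CHECK_N_ITERS changes above touch nearly every NIST case, but they all follow one calling pattern. A self-contained sketch of that pattern with a made-up two-parameter residual; the residual, names, and return-value check here are illustrative, while the functor layout (inputs()/values()/operator()/df()) and the minimize()/nfev/njev interface follow the test file:

    #include <Eigen/Core>
    #include <unsupported/Eigen/NonLinearOptimization>

    // Hypothetical residual f(x) = (x0 - 3, x1 + 1), minimized at (3, -1).
    struct simple_functor
    {
      typedef double Scalar;
      enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
      typedef Eigen::VectorXd InputType;
      typedef Eigen::VectorXd ValueType;
      typedef Eigen::MatrixXd JacobianType;

      int inputs() const { return 2; }
      int values() const { return 2; }

      int operator()(const Eigen::VectorXd &x, Eigen::VectorXd &fvec) const
      {
        fvec(0) = x(0) - 3.0;  // residuals to drive to zero
        fvec(1) = x(1) + 1.0;
        return 0;
      }
      int df(const Eigen::VectorXd &, Eigen::MatrixXd &fjac) const
      {
        fjac.setIdentity();    // Jacobian of the residuals (constant here)
        return 0;
      }
    };

    int main()
    {
      Eigen::VectorXd x(2);
      x << 0.0, 0.0;
      simple_functor functor;
      Eigen::LevenbergMarquardt<simple_functor> lm(functor);
      int info = lm.minimize(x);  // the tests above compare info, lm.nfev and lm.njev
      return (info > 0 && lm.fvec.blueNorm() < 1e-10) ? 0 : 1;
    }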
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..f442e416a 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -70,13 +70,16 @@ void alignedvector3()
     VERIFY_IS_APPROX(f6,r1-r4);
   }
 
+  FastType f8, f9(0,0,0);
+  VERIFY_IS_APPROX(f9-f1,-f1);
+
   std::stringstream ss1, ss2;
   ss1 << f1;
   ss2 << r1;
   VERIFY(ss1.str()==ss2.str());
 }
 
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( alignedvector3<float>() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 85743137e..2cea56ba5 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -44,7 +44,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -306,6 +306,8 @@ double bug_1222() {
   return denom.value();
 }
 
+#ifdef EIGEN_TEST_PART_5
+
 double bug_1223() {
   using std::min;
   typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
@@ -326,8 +328,8 @@ double bug_1223() {
 // regression test for some compilation issues with specializations of ScalarBinaryOpTraits
 void bug_1260() {
-  Matrix4d A;
-  Vector4d v;
+  Matrix4d A = Matrix4d::Ones();
+  Vector4d v = Vector4d::Ones();
   A*v;
 }
 
@@ -336,7 +338,7 @@ double bug_1261() {
   typedef AutoDiffScalar<Matrix2d> AD;
   typedef Matrix<AD,2,1> VectorAD;
 
-  VectorAD v;
+  VectorAD v(0.,0.);
   const AD maxVal = v.maxCoeff();
   const AD minVal = v.minCoeff();
   return maxVal.value() + minVal.value();
@@ -344,13 +346,30 @@ double bug_1261() {
 
 double bug_1264() {
   typedef AutoDiffScalar<Vector2d> AD;
-  const AD s;
-  const Matrix<AD, 3, 1> v1;
+  const AD s = 0.;
+  const Matrix<AD, 3, 1> v1(0.,0.,0.);
   const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
   return v2(0).value();
 }
 
-void test_autodiff()
+// check with expressions on constants
+double bug_1281() {
+  int n = 2;
+  typedef AutoDiffScalar<VectorXd> AD;
+  const AD c = 1.;
+  AD x0(2,n,0);
+  AD y1 = (AD(c)+AD(c))*x0;
+  y1 = x0 * (AD(c)+AD(c));
+  AD y2 = (-AD(c))+x0;
+  y2 = x0+(-AD(c));
+  AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+  y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+  return (y1+y2+y3).value();
+}
+
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( test_autodiff_scalar<1>() );
@@ -359,9 +378,10 @@ void test_autodiff()
     CALL_SUBTEST_4( test_autodiff_hessian<1>() );
   }
 
-  bug_1222();
-  bug_1223();
-  bug_1260();
-  bug_1261();
+  CALL_SUBTEST_5( bug_1222() );
+  CALL_SUBTEST_5( bug_1223() );
+  CALL_SUBTEST_5( bug_1260() );
+  CALL_SUBTEST_5( bug_1261() );
+  CALL_SUBTEST_5( bug_1281() );
 }
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 9cf11280c..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -81,12 +81,15 @@ void check_limits_specialization()
   typedef std::numeric_limits<AD> A;
   typedef std::numeric_limits<Scalar> B;
 
+  // workaround "unused typedef" warning:
+  VERIFY(!bool(internal::is_same<B, A>::value));
+
 #if EIGEN_HAS_CXX11
   VERIFY(bool(std::is_base_of<B, A>::value));
 #endif
 }
 
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2<float>() );
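Several of the autodiff fixes above (bug_1260, bug_1261, bug_1264, bug_1281) simply give AutoDiffScalar objects explicit initial values, since a default-constructed one carries an uninitialized value and an empty derivative vector. A small sketch of the value/derivative constructor that the new bug_1281 test relies on ("AD x0(2,n,0);"); the concrete numbers are made up:

    // Illustrative sketch only.
    #include <Eigen/Core>
    #include <unsupported/Eigen/AutoDiff>

    int main()
    {
      // AD(value, nbDerivatives, activeIndex): value 2, one derivative slot,
      // seeded so that d/dx = 1 for this variable.
      typedef Eigen::AutoDiffScalar<Eigen::VectorXd> AD;
      AD x(2.0, 1, 0);
      AD y = x * x + 3.0;  // y.value() == 7, y.derivatives()(0) == 2*x == 4
      return (y.value() == 7.0 && y.derivatives()(0) == 4.0) ? 0 : 1;
    }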
+ { + ArrayType x(21); + ArrayType expected(21); + ArrayType res(21); + + x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + CALL_SUBTEST(res = bessel_i1e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function j0. Reference results obtained with SciPy. + { + ArrayType x(77); + ArrayType expected(77); + ArrayType res(77); + + x << -38., -37., -36., -35., -34., -33., -32., -31., -30., + -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19., + -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8., + -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3., + 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., + 37., 38.; + + expected << 0.11433274, 0.01086237, -0.10556738, + -0.12684568, -0.03042119, 0.09727067, 0.13807901, 0.05120815, + -0.08636798, -0.14784876, -0.07315701, 0.07274192, 0.15599932, + 0.09626678, -0.05623027, -0.16241278, -0.12065148, 0.03657907, + 0.16702466, 0.14662944, -0.01335581, -0.16985425, -0.17489907, + -0.01422447, 0.17107348, 0.2069261 , 0.04768931, -0.1711903 , + -0.24593576, -0.09033361, 0.17165081, 0.30007927, 0.15064526, + -0.17759677, -0.39714981, -0.26005195, 0.22389078, 0.76519769, + 1. , 0.76519769, 0.22389078, -0.26005195, -0.39714981, + -0.17759677, 0.15064526, 0.30007927, 0.17165081, -0.09033361, + -0.24593576, -0.1711903 , 0.04768931, 0.2069261 , 0.17107348, + -0.01422447, -0.17489907, -0.16985425, -0.01335581, 0.14662944, + 0.16702466, 0.03657907, -0.12065148, -0.16241278, -0.05623027, + 0.09626678, 0.15599932, 0.07274192, -0.07315701, -0.14784876, + -0.08636798, 0.05120815, 0.13807901, 0.09727067, -0.03042119, + -0.12684568, -0.10556738, 0.01086237, 0.11433274; + + CALL_SUBTEST(res = bessel_j0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function j1. Reference results obtained with SciPy. + { + ArrayType x(81); + ArrayType expected(81); + ArrayType res(81); + + x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30., + -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19., + -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8., + -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3., + 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., + 37., 38., 39., 40.; + + expected << -0.12603832, -0.0640561 , 0.05916189, 0.13058004, 0.08232981, + -0.04399094, -0.13297118, -0.10061965, 0.02658903, 0.13302432, + 0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573, + 0.12535025, 0.15403807, 0.03951932, -0.11717779, -0.17112027, + -0.06683312, 0.10570143, 0.18799489, 0.09766849, -0.09039718, + -0.20510404, -0.13337515, 0.07031805, 0.2234471 , 0.1767853 , + -0.04347275, -0.24531179, -0.23463635, 0.00468282, 0.27668386, + 0.32757914, 0.06604333, -0.33905896, -0.57672481, -0.44005059, + 0. 
, 0.44005059, 0.57672481, 0.33905896, -0.06604333, + -0.32757914, -0.27668386, -0.00468282, 0.23463635, 0.24531179, + 0.04347275, -0.1767853 , -0.2234471 , -0.07031805, 0.13337515, + 0.20510404, 0.09039718, -0.09766849, -0.18799489, -0.10570143, + 0.06683312, 0.17112027, 0.11717779, -0.03951932, -0.15403807, + -0.12535025, 0.01504573, 0.13658472, 0.13055149, 0.0069342 , + -0.11875106, -0.13302432, -0.02658903, 0.10061965, 0.13297118, + 0.04399094, -0.08232981, -0.13058004, -0.05916189, 0.0640561 , + 0.12603832; + + CALL_SUBTEST(res = bessel_j1(x); + verify_component_wise(res, expected);); + } + // Test Bessel function k0e. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822, + 0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 , + 0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488, + 0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802, + 0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635, + 0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616, + 0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602, + 0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778, + 0.20265524, 0.20005668, 0.19755558; + + CALL_SUBTEST(res = bessel_k0e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k0. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01, + 3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03, + 4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05, + 6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07, + 9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09, + 1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11, + 2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12, + 4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14, + 7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16, + 1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18, + 2.31029936e-18, 8.39286110e-19; + + CALL_SUBTEST(res = bessel_k0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k0e. Reference results obtained with SciPy. 
+ { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822, + 0.6977616 , 0.60929767, 0.54780756, 0.50186313, + 0.4658451 , 0.43662302, 0.41229555, 0.39163193, + 0.3737955 , 0.35819488, 0.34439865, 0.33208364, + 0.32100235, 0.31096159, 0.30180802, 0.29341821, + 0.28569149, 0.27854488, 0.2719092 , 0.26572635, + 0.25994703, 0.25452917, 0.2494366 , 0.24463801, + 0.24010616, 0.23581722, 0.23175022, 0.22788667, + 0.22421014, 0.22070602, 0.21736123, 0.21416406, + 0.21110397, 0.20817141, 0.20535778, 0.20265524, + 0.20005668, 0.19755558; + + CALL_SUBTEST(res = bessel_k0e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k1. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01, + 4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03, + 4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05, + 6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07, + 1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09, + 1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11, + 2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12, + 4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14, + 7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16, + 1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18, + 2.33973340e-18, 8.49713195e-19; + + CALL_SUBTEST(res = bessel_k1(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k1e. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685, + 0.80656348, 0.68157595, 0.60027386, 0.54217591, + 0.49807158, 0.46314909, 0.43462525, 0.41076657, + 0.39043094, 0.37283175, 0.35740757, 0.34374563, + 0.33153489, 0.32053597, 0.31056123, 0.30146131, + 0.29311559, 0.2854255 , 0.27830958, 0.27169987, + 0.26553913, 0.25977879, 0.25437733, 0.249299 , + 0.24451285, 0.23999191, 0.2357126 , 0.23165413, + 0.22779816, 0.22412841, 0.22063036, 0.21729103, + 0.21409878, 0.21104314, 0.20811462, 0.20530466, + 0.20260547, 0.20000997; + + CALL_SUBTEST(res = bessel_k1e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function y0. Reference results obtained with SciPy. 
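Every block above follows the same recipe: fill an input array x, fill expected with values precomputed in SciPy, apply the Bessel function under test, and verify component-wise. Outside the test harness the same array API can be called directly; here is a minimal sketch (it assumes the unsupported SpecialFunctions module exposes Eigen::bessel_j0 as a free function on arrays, and the reference values are read off the j0 table above, not re-derived):

#include <Eigen/Core>
#include <unsupported/Eigen/SpecialFunctions>
#include <iostream>

int main() {
  Eigen::ArrayXd x(3);
  x << 0.0, 1.0, 2.0;
  // Component-wise Bessel J0; expected ~ 1.0, 0.76519769, 0.22389078.
  Eigen::ArrayXd y = Eigen::bessel_j0(x);
  std::cout << y.transpose() << std::endl;
  return 0;
}

The y0 and y1 tables that follow are exercised the same way.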
+ { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << -0.93157302, -0.44451873, 0.08825696, 0.51037567, 0.37685001, + -0.01694074, -0.30851763, -0.28819468, -0.02594974, 0.22352149, + 0.2499367 , 0.05567117, -0.16884732, -0.22523731, -0.07820786, + 0.12719257, 0.2054643 , 0.095811 , -0.0926372 , -0.18755216, + -0.10951969, 0.0626406 , 0.17020176, 0.1198876 , -0.03598179, + -0.15283403, -0.12724943, 0.01204463, 0.13521498, 0.13183647, + 0.00948116, -0.11729573, -0.13383266, -0.02874248, 0.09913483, + 0.13340405, 0.04579799, -0.08085609, -0.13071488, -0.06066076, + 0.06262353, 0.12593642; + + CALL_SUBTEST(res = bessel_y0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function y1. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243, + 0.32467442, 0.39792571, 0.14786314, -0.17501034, -0.30266724, + -0.15806046, 0.10431458, 0.24901542, 0.16370554, -0.05709922, + -0.21008141, -0.16664484, 0.02107363, 0.17797517, 0.16720504, + 0.00815513, -0.14956011, -0.16551161, -0.03253926, 0.12340586, + 0.1616692 , 0.05305978, -0.09882996, -0.15579655, -0.07025124, + 0.07552213, 0.14803412, 0.08442557, -0.05337283, -0.13854483, + -0.09578012, 0.03238588, 0.12751273, 0.10445477, -0.01262946, + -0.11514066, -0.11056411, -0.00579351; + + CALL_SUBTEST(res = bessel_y1(x); + verify_component_wise(res, expected);); + } +} + +EIGEN_DECLARE_TEST(bessel_functions) +{ + CALL_SUBTEST_1(array_bessel_functions<ArrayXf>()); + CALL_SUBTEST_2(array_bessel_functions<ArrayXd>()); +} diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 3b598bf42..7bf4e965f 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -30,11 +30,11 @@ static void test_basic_eventcount() EventCount ec(waiters); EventCount::Waiter& w = waiters[0]; ec.Notify(false); - ec.Prewait(&w); + ec.Prewait(); ec.Notify(true); ec.CommitWait(&w); - ec.Prewait(&w); - ec.CancelWait(&w); + ec.Prewait(); + ec.CancelWait(); } // Fake bounded counter-based queue. 
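The EventCount hunks above track an upstream API simplification: Prewait() and CancelWait() no longer take the Waiter, which is only handed over at CommitWait() time. Condensed from test_basic_eventcount() above (a sketch of the protocol, not standalone code; ec and waiters are set up as in the test):

  EventCount::Waiter& w = waiters[0];
  ec.Prewait();        // was ec.Prewait(&w): announce intent to wait
  ec.Notify(true);     // a notification arriving after Prewait()...
  ec.CommitWait(&w);   // ...lets CommitWait() complete without blocking
  ec.Prewait();
  ec.CancelWait();     // was ec.CancelWait(&w): abandon the wait instead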
@@ -112,7 +112,7 @@ static void test_stress_eventcount() unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; j--; - ec.Prewait(&w); + ec.Prewait(); bool empty = true; for (int q = 0; q < kQueues; q++) { if (!queues[q].Empty()) { @@ -121,7 +121,7 @@ static void test_stress_eventcount() } } if (!empty) { - ec.CancelWait(&w); + ec.CancelWait(); continue; } ec.CommitWait(&w); @@ -135,7 +135,7 @@ static void test_stress_eventcount() } } -void test_cxx11_eventcount() +EIGEN_DECLARE_TEST(cxx11_eventcount) { CALL_SUBTEST(test_basic_eventcount()); CALL_SUBTEST(test_stress_eventcount()); diff --git a/unsupported/test/cxx11_maxsizevector.cpp b/unsupported/test/cxx11_maxsizevector.cpp new file mode 100644 index 000000000..46b689a8e --- /dev/null +++ b/unsupported/test/cxx11_maxsizevector.cpp @@ -0,0 +1,77 @@ +#include "main.h" + +#include <exception> // std::exception + +#include <unsupported/Eigen/CXX11/Tensor> + +struct Foo +{ + static Index object_count; + static Index object_limit; + EIGEN_ALIGN_TO_BOUNDARY(128) int dummy; + + Foo(int x=0) : dummy(x) + { +#ifdef EIGEN_EXCEPTIONS + // TODO: Is this the correct way to handle this? + if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); } +#endif + std::cout << '+'; + ++Foo::object_count; + eigen_assert((internal::UIntPtr(this) & (127)) == 0); + } + Foo(const Foo&) + { + std::cout << 'c'; + ++Foo::object_count; + eigen_assert((internal::UIntPtr(this) & (127)) == 0); + } + + ~Foo() + { + std::cout << '~'; + --Foo::object_count; + } + + class Fail : public std::exception {}; +}; + +Index Foo::object_count = 0; +Index Foo::object_limit = 0; + + + +EIGEN_DECLARE_TEST(cxx11_maxsizevector) +{ + typedef MaxSizeVector<Foo> VectorX; + Foo::object_count = 0; + for(int r = 0; r < g_repeat; r++) { + Index rows = internal::random<Index>(3,30); + Foo::object_limit = internal::random<Index>(0, rows - 2); + std::cout << "object_limit = " << Foo::object_limit << std::endl; + bool exception_raised = false; +#ifdef EIGEN_EXCEPTIONS + try + { +#endif + std::cout << "\nVectorX m(" << rows << ");\n"; + VectorX vect(rows); + for(int i=0; i<rows; ++i) + vect.push_back(Foo()); +#ifdef EIGEN_EXCEPTIONS + VERIFY(false); // not reached if exceptions are enabled + } + catch (const Foo::Fail&) { exception_raised = true; } + VERIFY(exception_raised); +#endif + VERIFY_IS_EQUAL(Index(0), Foo::object_count); + + { + Foo::object_limit = rows+1; + VectorX vect2(rows, Foo()); + VERIFY_IS_EQUAL(Foo::object_count, rows); + } + VERIFY_IS_EQUAL(Index(0), Foo::object_count); + std::cout << '\n'; + } +} diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index 8911c59d8..510e11032 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -340,7 +340,7 @@ static void test_array_misc() VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5); } -void test_cxx11_meta() +EIGEN_DECLARE_TEST(cxx11_meta) { CALL_SUBTEST(test_gen_numeric_list()); CALL_SUBTEST(test_concat()); diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 5f9bb938b..993ee1789 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -11,22 +11,23 @@ #define EIGEN_USE_THREADS #include "main.h" #include "Eigen/CXX11/ThreadPool" +#include "Eigen/CXX11/Tensor" static void test_create_destroy_empty_pool() { // Just create and destroy the pool. 
This will wind up and tear down worker // threads. Ensure there are no issues in that logic. for (int i = 0; i < 16; ++i) { - NonBlockingThreadPool tp(i); + ThreadPool tp(i); } } -static void test_parallelism() +static void test_parallelism(bool allow_spinning) { // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads); + ThreadPool tp(kThreads, allow_spinning); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -100,8 +101,80 @@ static void test_parallelism() } } -void test_cxx11_non_blocking_thread_pool() + +static void test_cancel() +{ + ThreadPool tp(2); + + // Schedule a large number of closures that each sleep for two seconds. This + // will keep the thread pool busy for much longer than the default test timeout. + for (int i = 0; i < 1000; ++i) { + tp.Schedule([]() { + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + }); + } + + // Cancel the processing of all the closures that are still pending. + tp.Cancel(); +} + +static void test_pool_partitions() { + const int kThreads = 2; + ThreadPool tp(kThreads); + + // Assign each thread to its own partition, so that stealing other work only + // occurs globally when a thread is idle. + std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads); + for (int i = 0; i < kThreads; ++i) { + steal_partitions[i] = std::make_pair(i, i + 1); + } + tp.SetStealPartitions(steal_partitions); + + std::atomic<int> running(0); + std::atomic<int> done(0); + std::atomic<int> phase(0); + + // Schedule kThreads tasks and ensure that they are all running. + for (int i = 0; i < kThreads; ++i) { + tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); + ++running; + while (phase < 1) { + } + ++done; + }); + } + while (running != kThreads) { + } + // Schedule each closure to only run on thread 'i' and verify that it does. 
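For context, NonBlockingThreadPool is now spelled ThreadPool throughout, the constructor gained the allow_spinning flag exercised by both test_parallelism() calls below, and Cancel() is covered by test_cancel() above. A minimal, self-contained sketch of that public surface (the done counter is illustrative, not taken from the test):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <atomic>

int main() {
  std::atomic<int> done(0);  // declared before the pool so workers join before it dies
  Eigen::ThreadPool tp(4 /*num_threads*/, true /*allow_spinning*/);
  for (int i = 0; i < 100; ++i) {
    tp.Schedule([&done]() { ++done; });  // run the closure on some worker thread
  }
  tp.Cancel();  // best-effort: pending closures that have not started may be dropped
  return 0;     // ~ThreadPool() joins the workers
}

The loop that follows pins each closure to one thread with ScheduleWithHint(fn, i, i + 1), which is meaningful here because SetStealPartitions() above confined stealing to per-thread partitions.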
+ for (int i = 0; i < kThreads; ++i) { + tp.ScheduleWithHint( + [&, i]() { + ++running; + const int thread_id = tp.CurrentThreadId(); + VERIFY_IS_EQUAL(thread_id, i); + while (phase < 2) { + } + ++done; + }, + i, i + 1); + } + running = 0; + phase = 1; + while (running != kThreads) { + } + running = 0; + phase = 2; +} + + +EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool) { CALL_SUBTEST(test_create_destroy_empty_pool()); - CALL_SUBTEST(test_parallelism()); + CALL_SUBTEST(test_parallelism(true)); + CALL_SUBTEST(test_parallelism(false)); + CALL_SUBTEST(test_cancel()); + CALL_SUBTEST(test_pool_partitions()); } diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 91f690114..8fc5a3074 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -227,7 +227,7 @@ void test_stress_runqueue() VERIFY(total.load() == 0); } -void test_cxx11_runqueue() +EIGEN_DECLARE_TEST(cxx11_runqueue) { CALL_SUBTEST_1(test_basic_runqueue()); CALL_SUBTEST_2(test_empty_runqueue()); diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp index 037767270..4a0c8967b 100644 --- a/unsupported/test/cxx11_tensor_argmax.cpp +++ b/unsupported/test/cxx11_tensor_argmax.cpp @@ -273,7 +273,7 @@ static void test_argmin_dim() } } -void test_cxx11_tensor_argmax() +EIGEN_DECLARE_TEST(cxx11_tensor_argmax) { CALL_SUBTEST(test_simple_index_tuples<RowMajor>()); CALL_SUBTEST(test_simple_index_tuples<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu index 653443dc5..79f4066e9 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu @@ -9,19 +9,18 @@ #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_cuda + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; template <int Layout> -void test_cuda_simple_argmax() +void test_gpu_simple_argmax() { Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); @@ -37,13 +36,13 @@ void test_cuda_simple_argmax() double* d_in; DenseIndex* d_out_max; DenseIndex* d_out_min; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out_max), out_bytes); - cudaMalloc((void**)(&d_out_min), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out_max), out_bytes); + gpuMalloc((void**)(&d_out_min), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); @@ -53,20 +52,20 @@ void test_cuda_simple_argmax() gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_min.device(gpu_device) = gpu_in.argmin(); - assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + 
assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); - cudaFree(d_in); - cudaFree(d_out_max); - cudaFree(d_out_min); + gpuFree(d_in); + gpuFree(d_out_max); + gpuFree(d_out_min); } template <int DataLayout> -void test_cuda_argmax_dim() +void test_gpu_argmax_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -100,12 +99,12 @@ void test_cuda_argmax_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -113,8 +112,8 @@ void test_cuda_argmax_dim() gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), size_t(2*3*5*7 / tensor.dimension(dim))); @@ -137,25 +136,25 @@ void test_cuda_argmax_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } template <int DataLayout> -void test_cuda_argmin_dim() +void test_gpu_argmin_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -189,12 +188,12 @@ void test_cuda_argmin_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -202,8 +201,8 @@ void 
test_cuda_argmin_dim() gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), 2*3*5*7 / tensor.dimension(dim)); @@ -226,29 +225,29 @@ void test_cuda_argmin_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } -void test_cxx11_tensor_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu) { - CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp new file mode 100644 index 000000000..7ac71286e --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -0,0 +1,258 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
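The SYCL port below covers the same argmax/argmin ground as the GPU test above. For orientation, the underlying Tensor API is compact on the host as well; a small self-contained example (the returned indices are flat positions in the tensor's storage order):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);  // ColMajor by default
  t.setZero();
  t(1, 2, 3) = 1.0f;  // the last coefficient in column-major storage order

  // Rank-0 tensor holding the flat index of the maximum.
  Eigen::Tensor<Eigen::DenseIndex, 0> flat_max = t.argmax();
  // Reduce over dimension 1: one argmax index per (i, k) pair.
  Eigen::Tensor<Eigen::DenseIndex, 2> arg_dim1 = t.argmax(1);

  std::cout << flat_max() << std::endl;     // prints 23 == 2*3*4 - 1
  std::cout << arg_dim1(1, 3) << std::endl; // prints 2: the max sits at j == 2
  return 0;
}

This mirrors what the renamed GPU test checks with out_max and out_min.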
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 + +#include "main.h" + +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int Layout, typename DenseIndex> +static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) { + Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}}); + Tensor<DenseIndex, 0, Layout, DenseIndex> out_max; + Tensor<DenseIndex, 0, Layout, DenseIndex> out_min; + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(1, 1, 1) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(DataType); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, + Eigen::array<DenseIndex, 3>{{2, 2, 2}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min); + sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes); + + gpu_out_max.device(sycl_device) = gpu_in.argmax(); + gpu_out_min.device(sycl_device) = gpu_in.argmin(); + + sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); + sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); + + VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1); + VERIFY_IS_EQUAL(out_min(), 0); + + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out_max); + sycl_device.deallocate(d_out_min); +} + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) + // = 10.0 + tensor(ix) = (ix[dim] != 0) ? 
-1.0 : 10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix) = (ix[dim] != 0) ? 
1.0 : -10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + +template <typename DataType, typename Device_Selector> +void sycl_argmax_test_per_device(const Device_Selector& d) { + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index 8fe85d83c..ce9d24369 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -358,7 +358,7 @@ static void test_std_initializers_tensor() { #endif // EIGEN_HAS_VARIADIC_TEMPLATES } -void test_cxx11_tensor_assign() +EIGEN_DECLARE_TEST(cxx11_tensor_assign) { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp new file mode 100644 index 000000000..5fb12e0e0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -0,0 +1,576 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2018 Andy Davis <andydavis@google.com> +// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <algorithm> +#include <set> + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::Index; +using Eigen::RowMajor; +using Eigen::ColMajor; +using Eigen::internal::TensorBlockShapeType; + +static TensorOpCost zeroCost() { return {0, 0, 0}; } + +template<typename T> +static const T& choose(int layout, const T& col, const T& row) { + return layout == ColMajor ? col : row; +} + +static TensorBlockShapeType RandomShape() { + return internal::random<bool>() + ? TensorBlockShapeType::kUniformAllDims + : TensorBlockShapeType::kSkewedInnerDims; +} + +template <int NumDims> +static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) { + return internal::random<size_t>(1, dims.TotalSize()); +} + +template <int NumDims> +static DSizes<Index, NumDims> RandomDims() { + array<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<int>(1, 20); + } + return DSizes<Index, NumDims>(dims); +} + +template <typename T> +static T* GenerateRandomData(const Index& size) { + T* data = new T[size]; + for (int i = 0; i < size; ++i) { + data[i] = internal::random<T>(); + } + return data; +} + +template <int NumDims> +static void Debug(DSizes<Index, NumDims> dims) { + for (int i = 0; i < NumDims; ++i) { + std::cout << dims[i] << "; "; + } + std::cout << std::endl; +} + +template <int Layout> +static void test_block_mapper_sanity() +{ + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; + + DSizes<Index, 2> tensor_dims(100, 100); + + // Test uniform blocks. + TensorBlockMapper uniform_block_mapper( + tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()}); + + VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100); + VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100); + + // 10x10 blocks + auto uniform_b0 = uniform_block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10); + VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10); + + // Test skewed to inner dims blocks. + TensorBlockMapper skewed_block_mapper( + tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()}); + + VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100); + VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100); + + // 1x100 (100x1) rows/cols depending on a tensor layout. + auto skewed_b0 = skewed_block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1)); + VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100)); +} + +// Given a TensorBlock, "visit" every element accessible through it, and keep an +// index in the visited set. Verify that every coeff is accessed only once. 
+template<int NumDims, int Layout> +static void UpdateCoeffSet( + const DSizes<Index, NumDims>& tensor_strides, + const internal::TensorBlockDescriptor<NumDims>& block, + Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) { + const DSizes<Index, NumDims>& block_sizes = block.dimensions(); + + for (int i = 0; i < block_sizes[dim_index]; ++i) { + if (tensor_strides[dim_index] == 1) { + typedef std::pair<std::set<Index>::iterator, bool> ReturnType; + ReturnType inserted = visited_coeffs->insert(first_coeff_index + i); + VERIFY_IS_EQUAL(inserted.second, true); + } else { + int next_dim_index = dim_index + choose(Layout, -1, 1); + UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index, + next_dim_index, visited_coeffs); + first_coeff_index += tensor_strides[dim_index]; + } + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_mapper_maps_every_element() { + typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper; + + DSizes<Index, NumDims> dims = RandomDims<NumDims>(); + DSizes<Index, NumDims> strides = internal::strides<Layout>(dims); + + // Keep track of elements indices available via block access. + std::set<Index> coeff_set; + + // Try different combinations of block types and sizes. + TensorBlockMapper block_mapper( + dims, {RandomShape(), RandomTargetSize(dims), zeroCost()}); + + for (int i = 0; i < block_mapper.blockCount(); ++i) { + auto block = block_mapper.blockDescriptor(i); + UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(), + choose(Layout, NumDims - 1, 0), + &coeff_set); + } + + // Verify that every coefficient in the original Tensor is accessible through + // TensorBlock only once. + Index total_coeffs = dims.TotalSize(); + VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs); + VERIFY_IS_EQUAL(*coeff_set.begin(), 0); + VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1); +} + +template <int Layout, int NumDims> +static Index GetInputIndex(Index output_index, + const array<Index, NumDims>& output_to_input_dim_map, + const array<Index, NumDims>& input_strides, + const array<Index, NumDims>& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template <int Layout, int NumDims> +static array<Index, NumDims> ComputeStrides( + const array<Index, NumDims>& sizes) { + array<Index, NumDims> strides; + if (Layout == ColMajor) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * sizes[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + return strides; +} + +template<typename Scalar, typename StorageIndex, int Dim> +class EqualityChecker +{ + const Scalar* input_data; + const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides; + void check_recursive(const Scalar* input, const Scalar* output, int depth=0) 
const + { + if(depth==Dim) + { + VERIFY_IS_EQUAL(*input, *output); + return; + } + + for(int i=0; i<output_dims[depth]; ++i) + { + check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1); + } + } +public: + EqualityChecker(const Scalar* input_data_, + const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_, + const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_) + : input_data(input_data_) + , input_dims(input_dims_), input_strides(input_strides_) + , output_dims(output_dims_), output_strides(output_strides_) + {} + + void operator()(const Scalar* output_data) const + { + check_recursive(input_data, output_data); + } +}; + +template <int Layout> +static void test_uniform_block_shape() +{ + typedef internal::TensorBlockDescriptor<5> TensorBlock; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; + + { + // Test shape 'UniformAllDims' with uniform 'max_coeff count'. + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // partially into first inner-most dimension. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 6; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(6, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first inner-most dimension. 
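To spell out the arithmetic behind these cases: kUniformAllDims first aims for equal block sides, so a budget of 5*5*5*5*5 coefficients on an (11, 5, 6, 17, 7) tensor yields a 5x5x5x5x5 block. Any surplus is granted to the inner-most dimension of the storage order, which is why the ColMajor budget of 7*5*5*5*5 produces a 7x5x5x5x5 block (dimension 0 is inner-most) while the RowMajor budget of 5*5*5*5*6 grows dimension 4 to 6 instead. The next pair of cases pushes this to the point where the inner-most dimension is covered completely (11 for ColMajor, 7 for RowMajor).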
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first few inner-most dimensions. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 6 * 7 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(7, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const Index max_coeff_count = 5 * 5 * 5 * 6 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(6, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(5, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with full allocation to all dims. 
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const Index max_coeff_count = 7 * 5 * 6 * 9 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(9, block.dimensions()[3]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_skewed_inner_dim_block_shape() +{ + typedef internal::TensorBlockDescriptor<5> TensorBlock; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; + + // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 10 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(10, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 1 * 6; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(6, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 1 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to second inner-dim. 
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 3 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(3, block.dimensions()[1]); + for (int i = 2; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 15 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(15, block.dimensions()[3]); + for (int i = 2; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to third inner-dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 5 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + for (int i = 3; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 5 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + for (int i = 1; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to all dims. 
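The skewed cases apply the complementary rule: kSkewedInnerDims spends the whole budget on the inner-most dimension first and only then grows the next one outward. Hence the ColMajor budget of 11*3 yields an 11x3x1x1x1 block, the RowMajor budget of 15*7 yields 1x1x1x15x7, and with budget 11*5*5 the third inner dimension starts to fill. Once the budget reaches the full tensor size (11*5*6*17*7), the block degenerates to the whole tensor, which is exactly what the final ColMajor/RowMajor pair below verifies.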
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_empty_dims(const internal::TensorBlockShapeType block_shape) +{ + // Test blocking of tensors with zero dimensions: + // - we must not crash on asserts and divisions by zero + // - we must not return block with zero dimensions + // (recipe for overflows/underflows, divisions by zero and NaNs later) + // - total block count must be zero + { + typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper; + + DSizes<Index, 1> dims(0); + for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper( + dims, {block_shape, max_coeff_count, zeroCost()}); + VERIFY_IS_EQUAL(block_mapper.blockCount(), 0); + VERIFY(block_mapper.blockTotalSize() >= 1); + } + } + + { + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; + + for (int dim1 = 0; dim1 < 3; ++dim1) { + for (int dim2 = 0; dim2 < 3; ++dim2) { + DSizes<Index, 2> dims(dim1, dim2); + for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper( + dims, {block_shape, max_coeff_count, zeroCost()}); + if (dim1 * dim2 == 0) { + VERIFY_IS_EQUAL(block_mapper.blockCount(), 0); + } + VERIFY(block_mapper.blockTotalSize() >= 1); + } + } + } + } +} + +#define TEST_LAYOUTS(NAME) \ + CALL_SUBTEST(NAME<ColMajor>()); \ + CALL_SUBTEST(NAME<RowMajor>()) + +#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \ + CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, RowMajor>())) + +#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \ + CALL_SUBTEST(NAME<ColMajor>(ARG)); \ + CALL_SUBTEST(NAME<RowMajor>(ARG)) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { + TEST_LAYOUTS(test_block_mapper_sanity); + TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element); + TEST_LAYOUTS(test_uniform_block_shape); + TEST_LAYOUTS(test_skewed_inner_dim_block_shape); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, 
TensorBlockShapeType::kSkewedInnerDims); +} + +#undef TEST_LAYOUTS +#undef TEST_LAYOUTS_WITH_ARG diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp new file mode 100644 index 000000000..b2e26ebb7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -0,0 +1,858 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// clang-format off +#include "main.h" +#include <Eigen/CXX11/Tensor> +// clang-format on + +using Eigen::internal::TensorBlockDescriptor; +using Eigen::internal::TensorExecutor; + +// -------------------------------------------------------------------------- // +// Utility functions to generate random tensors, blocks, and evaluate them. + +template <int NumDims> +static DSizes<Index, NumDims> RandomDims(Index min, Index max) { + DSizes<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<Index>(min, max); + } + return DSizes<Index, NumDims>(dims); +} + +// Block offsets and extents allow constructing a TensorSlicingOp corresponding +// to a TensorBlockDescriptor. +template <int NumDims> +struct TensorBlockParams { + DSizes<Index, NumDims> offsets; + DSizes<Index, NumDims> sizes; + TensorBlockDescriptor<NumDims, Index> desc; +}; + +template <int Layout, int NumDims> +static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims, + Index min, Index max) { + // Choose random offsets and sizes along all tensor dimensions. + DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max)); + DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max)); + + // Make sure that offset + size do not overflow dims. + for (int i = 0; i < NumDims; ++i) { + offsets[i] = numext::mini(dims[i] - 1, offsets[i]); + sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]); + } + + Index offset = 0; + DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims); + for (int i = 0; i < NumDims; ++i) { + offset += strides[i] * offsets[i]; + } + + return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)}; +} + +// Generate a block with block sizes skewed towards inner dimensions. This type of +// block is required for evaluating broadcast expressions. +template <int Layout, int NumDims> +static TensorBlockParams<NumDims> SkewedInnerBlock( + DSizes<Index, NumDims> dims) { + using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>; + BlockMapper block_mapper(dims, + {internal::TensorBlockShapeType::kSkewedInnerDims, + internal::random<size_t>(1, dims.TotalSize()), + {0, 0, 0}}); + + Index total_blocks = block_mapper.blockCount(); + Index block_index = internal::random<Index>(0, total_blocks - 1); + auto block = block_mapper.blockDescriptor(block_index); + DSizes<Index, NumDims> sizes = block.dimensions(); + + auto strides = internal::strides<Layout>(dims); + DSizes<Index, NumDims> offsets; + + // Compute offsets for the first block coefficient. 
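What follows is plain mixed-radix delinearization: starting from the outer-most dimension of the storage order, divide the flat offset by that dimension's stride and keep the remainder. For example, with ColMajor dims (2, 3, 4) the strides are (1, 2, 6); a flat offset of 23 gives 23 / 6 = 3 (remainder 5), then 5 / 2 = 2 (remainder 1), then 1, i.e. the coordinates (1, 2, 3). The branch below does exactly this for both storage orders.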
+ Index index = block.offset(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + if (NumDims > 0) offsets[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + if (NumDims > 0) offsets[NumDims - 1] = index; + } + + return {offsets, sizes, block}; +} + +template <int NumDims> +static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) { + DSizes<Index, NumDims> offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + + return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)}; +} + +inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) { + Eigen::IndexList<Index, Eigen::type2index<1>> ret; + ret.set(0, n); + return ret; +} +inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) { + Eigen::IndexList<Eigen::type2index<1>, Index> ret; + ret.set(1, m); + return ret; +} + +// -------------------------------------------------------------------------- // +// Verify that block expression evaluation produces the same result as a +// TensorSliceOp (reading a tensor block is the same as taking a tensor slice). + +template <typename T, int NumDims, int Layout, typename Expression, + typename GenBlockParams> +static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) { + using Device = DefaultDevice; + auto d = Device(); + + // Scratch memory allocator for block evaluation. + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + TensorBlockScratch scratch(d); + + // TensorEvaluator is needed to produce tensor blocks of the expression. + auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d); + eval.evalSubExprsIfNeeded(nullptr); + + // Choose random offsets, sizes and a TensorBlockDescriptor. + TensorBlockParams<NumDims> block_params = gen_block(); + + // Evaluate TensorBlock expression into a tensor. + Tensor<T, NumDims, Layout> block(block_params.desc.dimensions()); + + // Dimensions for the potential destination buffer. + DSizes<Index, NumDims> dst_dims; + if (internal::random<bool>()) { + dst_dims = block_params.desc.dimensions(); + } else { + for (int i = 0; i < NumDims; ++i) { + Index extent = internal::random<Index>(0, 5); + dst_dims[i] = block_params.desc.dimension(i) + extent; + } + } + + // Maybe use this tensor as a block desc destination. + Tensor<T, NumDims, Layout> dst(dst_dims); + dst.setZero(); + if (internal::random<bool>()) { + block_params.desc.template AddDestinationBuffer<Layout>( + dst.data(), internal::strides<Layout>(dst.dimensions())); + } + + const bool root_of_expr = internal::random<bool>(); + auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr); + + if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) { + // Copy data from destination buffer. + if (dimensions_match(dst.dimensions(), block.dimensions())) { + block = dst; + } else { + DSizes<Index, NumDims> offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + block = dst.slice(offsets, block.dimensions()); + } + + } else { + // Assign to block from expression. + auto b_expr = tensor_block.expr(); + + // We explicitly disable vectorization and tiling to run a plain coefficient- + // wise assignment loop, which is simple enough to be trusted as correct. 
+ using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>; + using BlockExecutor = TensorExecutor<const BlockAssign, Device, false, + internal::TiledEvaluation::Off>; + BlockExecutor::run(BlockAssign(block, b_expr), d); + } + + // Cleanup temporary buffers owned by a tensor block. + tensor_block.cleanup(); + + // Compute a Tensor slice corresponding to a Tensor block. + Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions()); + auto s_expr = expr.slice(block_params.offsets, block_params.sizes); + + // Explicitly use coefficient assignment to evaluate slice expression. + using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>; + using SliceExecutor = TensorExecutor<const SliceAssign, Device, false, + internal::TiledEvaluation::Off>; + SliceExecutor::run(SliceAssign(slice, s_expr), d); + + // Tensor block and tensor slice must be the same. + for (Index i = 0; i < block.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Identity tensor expression transformation. + VerifyBlockEvaluator<T, NumDims, Layout>( + input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_unary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_binary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_binary_with_unary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + (lhs.square() + rhs.square()).sqrt(), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_broadcast() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5); + + DSizes<Index, NumDims> bcasted_dims; + for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i]; + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); }); + + // Check that desc.destination() memory is not shared between two broadcast + // materializations. 
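+  // If both sides of the product were materialized into the same destination
+  // buffer, the second materialization would overwrite the first operand's
+  // data before the product is computed.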
+ VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast) * input.abs().broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_reshape() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10); + + DSizes<Index, NumDims> shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reshape(shuffled), + [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_cast() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.template cast<int>().template cast<T>(), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_select() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims); + Tensor<T, NumDims, Layout> rhs(dims); + Tensor<bool, NumDims, Layout> cond(dims); + lhs.setRandom(); + rhs.setRandom(); + cond.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() { + return RandomBlock<Layout>(dims, 1, 20); + }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_padding() { + const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1; + + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4); + DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4); + array<std::pair<Index, Index>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = std::make_pair(pad_before[i], pad_after[i]); + } + + // Test squeezing reads from inner dim. + if (internal::random<bool>()) { + pad_before[inner_dim] = 0; + pad_after[inner_dim] = 0; + paddings[inner_dim] = std::make_pair(0, 0); + } + + DSizes<Index, NumDims> padded_dims; + for (int i = 0; i < NumDims; ++i) { + padded_dims[i] = dims[i] + pad_before[i] + pad_after[i]; + } + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return FixedSizeBlock(padded_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_chipping() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + Index chip_dim = internal::random<int>(0, NumDims - 1); + Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2); + + DSizes<Index, NumDims - 1> chipped_dims; + for (Index i = 0; i < chip_dim; ++i) { + chipped_dims[i] = dims[i]; + } + for (Index i = chip_dim + 1; i < NumDims; ++i) { + chipped_dims[i - 1] = dims[i]; + } + + // Block buffer forwarding. 
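+  // (Chipping a plain tensor should let the evaluator expose the underlying
+  // input buffer as a view instead of materializing a copy.)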
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); + + // Block expression assignment. + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.abs().chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.abs().chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); +} + + +template<typename T, int NumDims> +struct SimpleTensorGenerator { + T operator()(const array<Index, NumDims>& coords) const { + T result = static_cast<T>(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast<T>((i + 1) * coords[i]); + } + return result; + } +}; + +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<int NumDims> +struct SimpleTensorGenerator<bool, NumDims> { + bool operator()(const array<Index, NumDims>& coords) const { + bool result = false; + for (int i = 0; i < NumDims; ++i) { + result ^= coords[i]; + } + return result; + } +}; + + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_generator() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + auto generator = SimpleTensorGenerator<T, NumDims>(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_reverse() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Randomly reverse dimensions. + Eigen::DSizes<bool, NumDims> reverse; + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() { + return RandomBlock<Layout>(dims, 1, 10); + }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_slice() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Pick a random slice of an input tensor. + DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10); + DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
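+  // E.g. with dims[i] = 12, slice_start[i] = 10 and slice_size[i] = 7, the
+  // start is kept and the size is clamped to 12 - 10 = 2.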
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_shuffle() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + do { + DSizes<Index, NumDims> shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock<Layout>(shuffled_dims, 1, 5); + }); + + break; + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); +} + +template <typename T, int Layout> +static void test_eval_tensor_reshape_with_bcast() { + Index dim = internal::random<Index>(1, 100); + + Tensor<T, 2, Layout> lhs(1, dim); + Tensor<T, 2, Layout> rhs(dim, 1); + lhs.setRandom(); + rhs.setRandom(); + + auto reshapeLhs = NByOne(dim); + auto reshapeRhs = OneByM(dim); + + auto bcastLhs = OneByM(dim); + auto bcastRhs = NByOne(dim); + + DSizes<Index, 2> dims(dim, dim); + + VerifyBlockEvaluator<T, 2, Layout>( + lhs.reshape(reshapeLhs).broadcast(bcastLhs) * + rhs.reshape(reshapeRhs).broadcast(bcastRhs), + [dims]() { return SkewedInnerBlock<Layout, 2>(dims); }); +} + +template <typename T, int Layout> +static void test_eval_tensor_forced_eval() { + Index dim = internal::random<Index>(1, 100); + + Tensor<T, 2, Layout> lhs(dim, 1); + Tensor<T, 2, Layout> rhs(1, dim); + lhs.setRandom(); + rhs.setRandom(); + + auto bcastLhs = OneByM(dim); + auto bcastRhs = NByOne(dim); + + DSizes<Index, 2> dims(dim, dim); + + VerifyBlockEvaluator<T, 2, Layout>( + (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims), + [dims]() { return SkewedInnerBlock<Layout, 2>(dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims), + [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); }); +} + +template <typename T, int Layout> +static void test_eval_tensor_chipping_of_bcast() { + if (Layout != static_cast<int>(RowMajor)) return; + + Index dim0 = internal::random<Index>(1, 10); + Index dim1 = internal::random<Index>(1, 10); + Index dim2 = internal::random<Index>(1, 10); + + Tensor<T, 3, Layout> input(1, dim1, dim2); + input.setRandom(); + + Eigen::array<Index, 3> bcast = {{dim0, 1, 1}}; + DSizes<Index, 2> chipped_dims(dim0, dim2); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); }); +} + +// 
-------------------------------------------------------------------------- //
+// Verify that assigning a block to a Tensor expression produces the same
+// result as an assignment to a TensorSliceOp (writing a block is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+                                  Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // We use a tensor evaluator as the target for block and slice assignments.
+  auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits the full expression.
+  TensorBlockParams<NumExprDims> block_params = gen_block();
+
+  // Generate random data of the selected block size.
+  Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+  block.setRandom();
+
+  // ************************************************************************ //
+  // (1) Assignment from a block.
+
+  // Construct a materialized block from the randomly generated block tensor.
+  internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+      internal::TensorBlockKind::kView, block.data(), block.dimensions());
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Use the evaluator to write the block into the tensor.
+  eval.writeBlock(block_params.desc, blk);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+  // ************************************************************************ //
+  // (2) Assignment to a slice.
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Assign the block to a slice of the original expression.
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate the slice expression.
+  using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+  // Make a copy of the result after assignment.
+ Tensor<T, NumDims, Layout> slice_assigned = tensor; + + for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); }); + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map, [&dims]() { return FixedSizeBlock(dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_reshape() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + DSizes<Index, NumDims> shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); }); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); }); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return FixedSizeBlock(shuffled); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_chipping() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + Index chip_dim = internal::random<int>(0, NumDims - 1); + Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2); + + DSizes<Index, NumDims - 1> chipped_dims; + for (Index i = 0; i < chip_dim; ++i) { + chipped_dims[i] = dims[i]; + } + for (Index i = chip_dim + 1; i < NumDims; ++i) { + chipped_dims[i - 1] = dims[i]; + } + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); }); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_slice() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + // Pick a random slice of tensor. + DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10); + DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+  // clang-format off
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+  // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 000000000..52f7dde9b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
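+//
+// As exercised below, TensorBlockIO::Copy takes a destination described by
+// (dimensions, strides, data, offset) and a source described by
+// (strides, data, offset), plus an optional dst-to-src dimension map; where
+// the innermost dimensions stay contiguous it is expected to squeeze them
+// into single linear copies (see the squeeze/do-not-squeeze tests below).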
+ +template <int NumDims> +static DSizes<Index, NumDims> RandomDims(Index min, Index max) { + DSizes<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<Index>(min, max); + } + return DSizes<Index, NumDims>(dims); +} + +static internal::TensorBlockShapeType RandomBlockShape() { + return internal::random<bool>() + ? internal::TensorBlockShapeType::kUniformAllDims + : internal::TensorBlockShapeType::kSkewedInnerDims; +} + +template <int NumDims> +static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) { + return internal::random<size_t>(1, dims.TotalSize()); +} + +template <int Layout, int NumDims> +static Index GetInputIndex(Index output_index, + const array<Index, NumDims>& output_to_input_dim_map, + const array<Index, NumDims>& input_strides, + const array<Index, NumDims>& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_data_from_source_to_target() { + using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Generate a random input Tensor. + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Write data to an output Tensor. + Tensor<T, NumDims, Layout> output(dims); + + // Construct a tensor block mapper. + using TensorBlockMapper = + internal::TensorBlockMapper<NumDims, Layout, Index>; + TensorBlockMapper block_mapper( + dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}}); + + // We will copy data from input to output through this buffer. + Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions()); + + // Precompute strides for TensorBlockIO::Copy. + auto input_strides = internal::strides<Layout>(dims); + auto output_strides = internal::strides<Layout>(dims); + + const T* input_data = input.data(); + T* output_data = output.data(); + T* block_data = block.data(); + + for (int i = 0; i < block_mapper.blockCount(); ++i) { + auto desc = block_mapper.blockDescriptor(i); + + auto blk_dims = desc.dimensions(); + auto blk_strides = internal::strides<Layout>(blk_dims); + + { + // Read from input into a block buffer. + IODst dst(blk_dims, blk_strides, block_data, 0); + IOSrc src(input_strides, input_data, desc.offset()); + + TensorBlockIO::Copy(dst, src); + } + + { + // Write from block buffer to output. + IODst dst(blk_dims, output_strides, output_data, desc.offset()); + IOSrc src(blk_strides, block_data, 0); + + TensorBlockIO::Copy(dst, src); + } + } + + for (int i = 0; i < dims.TotalSize(); ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_using_reordered_dimensions() { + // Generate a random input Tensor. 
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<int> shuffle;
+
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+  DSizes<Index, NumDims> output_tensor_dims;
+  DSizes<Index, NumDims> input_to_output_dim_map;
+  DSizes<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
+    output_tensor_dims[shuffle[i]] = dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+  // Construct a tensor block mapper.
+  // NOTE: Tensor block mapper works with shuffled dimensions.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+        desc.offset(), output_to_input_dim_map, input_strides,
+        output_strides);
+
+    // NOTE: Block dimensions are in the same order as output dimensions.
+
+    using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+    using IODst = typename TensorBlockIO::Dst;
+    using IOSrc = typename TensorBlockIO::Src;
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, first_coeff_index);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+
+    {
+      // We need to convert block dimensions from output to input order.
+      auto dst_dims = blk_dims;
+      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+      }
+
+      // Write from block buffer to output.
+      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+      IOSrc src(blk_strides, block_data, 0);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+  }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+// This is a special case of reading data with reordering, when the dimensions
+// before and after reordering are the same. Squeezing reads along the inner
+// dimensions is illegal in this case, because the innermost dimension is
+// reordered.
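+// (In the test below, block dims 0 and 2 are swapped, so the innermost
+// dimension of the block maps to the outermost dimension of the tensor for
+// either layout, and inner reads cannot be coalesced into contiguous runs.)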
+template <int Layout> +static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() { + DSizes<Index, 3> tensor_dims(7, 9, 7); + DSizes<Index, 3> block_dims = tensor_dims; + + DSizes<int, 3> block_to_tensor_dim; + block_to_tensor_dim[0] = 2; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 0; + + auto tensor_strides = internal::strides<Layout>(tensor_dims); + auto block_strides = internal::strides<Layout>(block_dims); + + Tensor<float, 3, Layout> block(block_dims); + Tensor<float, 3, Layout> tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. + IODst dst(block_dims, block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, 0); + + TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims); + TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + float block_value = block_tensor(d2, d1, d0); + float tensor_value = tensor_tensor(d0, d1, d2); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } +} + +// This is the special case for reading data with reordering, when dimensions +// before/after reordering are the same. Squeezing reads in this case is allowed +// because we reorder outer dimensions. +template <int Layout> +static void test_block_io_copy_using_reordered_dimensions_squeeze() { + DSizes<Index, 4> tensor_dims(7, 5, 9, 9); + DSizes<Index, 4> block_dims = tensor_dims; + + DSizes<int, 4> block_to_tensor_dim; + block_to_tensor_dim[0] = 0; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 3; + block_to_tensor_dim[3] = 2; + + auto tensor_strides = internal::strides<Layout>(tensor_dims); + auto block_strides = internal::strides<Layout>(block_dims); + + Tensor<float, 4, Layout> block(block_dims); + Tensor<float, 4, Layout> tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. 
+ IODst dst(block_dims, block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, 0); + + TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims); + TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) { + float block_value = block_tensor(d0, d1, d3, d2); + float tensor_value = tensor_tensor(d0, d1, d2, d3); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } + } +} + +template <int Layout> +static void test_block_io_zero_stride() { + DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30); + + DSizes<Index, 5> input_tensor_dims = rnd_dims; + input_tensor_dims[0] = 1; + input_tensor_dims[2] = 1; + input_tensor_dims[4] = 1; + + Tensor<float, 5, Layout> input(input_tensor_dims); + input.setRandom(); + + DSizes<Index, 5> output_tensor_dims = rnd_dims; + + auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims); + auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims); + + auto input_tensor_strides_with_zeros = input_tensor_strides; + input_tensor_strides_with_zeros[0] = 0; + input_tensor_strides_with_zeros[2] = 0; + input_tensor_strides_with_zeros[4] = 0; + + Tensor<float, 5, Layout> output(output_tensor_dims); + output.setRandom(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Write data from input to output with broadcasting in dims [0, 2, 4]. + IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0); + IOSrc src(input_tensor_strides_with_zeros, input.data(), 0); + TensorBlockIO::Copy(dst, src); + + for (int i = 0; i < output_tensor_dims[0]; ++i) { + for (int j = 0; j < output_tensor_dims[1]; ++j) { + for (int k = 0; k < output_tensor_dims[2]; ++k) { + for (int l = 0; l < output_tensor_dims[3]; ++l) { + for (int m = 0; m < output_tensor_dims[4]; ++m) { + float input_value = input(0, j, 0, l, 0); + float output_value = output(i, j, k, l, m); + VERIFY_IS_EQUAL(input_value, output_value); + } + } + } + } + } +} + +template <int Layout> +static void test_block_io_squeeze_ones() { + using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Total size > 1. + { + DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1); + auto strides = internal::strides<Layout>(block_sizes); + + // Create a random input tensor. + Tensor<float, 5> input(block_sizes); + input.setRandom(); + + Tensor<float, 5> output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } + + // Total size == 1. + { + DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1); + auto strides = internal::strides<Layout>(block_sizes); + + // Create a random input tensor. 
+ Tensor<float, 5> input(block_sizes); + input.setRandom(); + + Tensor<float, 5> output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } +} + +#define CALL_SUBTESTS(NAME) \ + CALL_SUBTEST((NAME<float, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 5, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 5, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 5, ColMajor>())) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_io) { + // clang-format off + CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target); + CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>()); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>()); + + CALL_SUBTEST(test_block_io_zero_stride<RowMajor>()); + CALL_SUBTEST(test_block_io_zero_stride<ColMajor>()); + + CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>()); + CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>()); + // clang-format on +} diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index 7201bfe37..20f84b8e0 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -13,8 +13,8 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -25,50 +25,120 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ +template <typename DataType, int DataLayout, typename IndexType> +static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){ // BROADCAST test: - array<int, 4> in_range = {{2, 3, 5, 7}}; - array<int, 4> broadcasts = {{2, 3, 1, 4}}; - array<int, 4> out_range; // = in_range * broadcasts + IndexType inDim1=2; + IndexType inDim2=3; + IndexType inDim3=5; + IndexType inDim4=7; + IndexType bDim1=2; + IndexType bDim2=3; + IndexType bDim3=1; + IndexType bDim4=4; + array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; + array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; + array<IndexType, 4> out_range; // = in_range * broadcasts for (size_t i = 0; i < out_range.size(); ++i) out_range[i] = in_range[i] * broadcasts[i]; - Tensor<float, 4> input(in_range); - Tensor<float, 4> out(out_range); + Tensor<DataType, 4, DataLayout, IndexType> input(in_range); + Tensor<DataType, 4, DataLayout, 
IndexType> out(out_range); for (size_t i = 0; i < in_range.size(); ++i) VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); - for (int i = 0; i < input.size(); ++i) - input(i) = static_cast<float>(i); + for (IndexType i = 0; i < input.size(); ++i) + input(i) = static_cast<DataType>(i); - float * gpu_in_data = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<float, 4>> gpu_in(gpu_in_data, in_range); - TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range); - sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float)); + TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); + sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 28; ++l) { + for (IndexType i = 0; i < inDim1*bDim1; ++i) { + for (IndexType j = 0; j < inDim2*bDim2; ++j) { + for (IndexType k = 0; k < inDim3*bDim3; ++k) { + for (IndexType l = 0; l < inDim4*bDim4; ++l) { VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l)); } } } } + printf("Broadcast Test with fixed size Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ + + // BROADCAST test: + IndexType inDim1=2; + IndexType inDim2=3; + IndexType inDim3=5; + IndexType inDim4=7; + IndexType bDim1=2; + IndexType bDim2=3; + IndexType bDim3=1; + IndexType bDim4=4; + array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; + array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; + array<IndexType, 4> out_range; // = in_range * broadcasts + for (size_t i = 0; i < out_range.size(); ++i) + out_range[i] = in_range[i] * broadcasts[i]; + + Tensor<DataType, 4, DataLayout, IndexType> input(in_range); + Tensor<DataType, 4, DataLayout, IndexType> out(out_range); + + for (size_t i = 0; i < in_range.size(); ++i) + VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); + + + for (IndexType i = 0; i < input.size(); ++i) + input(i) = static_cast<DataType>(i); + + DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); + sycl_device.memcpyHostToDevice(gpu_in_data, 
input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < inDim1*bDim1; ++i) { + for (IndexType j = 0; j < inDim2*bDim2; ++j) { + for (IndexType k = 0; k < inDim3*bDim3; ++k) { + for (IndexType l = 0; l < inDim4*bDim4; ++l) { + VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l)); + } + } + } + } printf("Broadcast Test Passed\n"); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -void test_cxx11_tensor_broadcast_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_broadcast_sycl(sycl_device)); +template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){ + std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); + test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 5c0ea5889..d3dab891f 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -91,7 +91,16 @@ static void test_vectorized_broadcasting() } } +#if EIGEN_HAS_VARIADIC_TEMPLATES tensor.resize(11,3,5); +#else + array<Index, 3> new_dims; + new_dims[0] = 11; + new_dims[1] = 3; + new_dims[2] = 5; + tensor.resize(new_dims); +#endif + tensor.setRandom(); broadcast = tensor.broadcast(broadcasts); @@ -115,7 +124,7 @@ static void test_static_broadcasting() Tensor<float, 3, DataLayout> tensor(8,3,5); tensor.setRandom(); -#if EIGEN_HAS_CONSTEXPR +#if defined(EIGEN_HAS_INDEX_LIST) Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; #else Eigen::array<int, 3> broadcasts; @@ -139,7 +148,16 @@ static void test_static_broadcasting() } } +#if EIGEN_HAS_VARIADIC_TEMPLATES tensor.resize(11,3,5); +#else + array<Index, 3> new_dims; + new_dims[0] = 11; + new_dims[1] = 3; + new_dims[2] = 5; + tensor.resize(new_dims); +#endif + tensor.setRandom(); broadcast = tensor.broadcast(broadcasts); @@ -180,8 +198,119 @@ static void test_fixed_size_broadcasting() #endif } +template <int DataLayout> +static void test_simple_broadcasting_one_by_n() +{ + Tensor<float, 4, DataLayout> tensor(1,13,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 9; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 9); + VERIFY_IS_EQUAL(broadcast.dimension(1), 13); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 7); + + for (int i = 0; i < 9; ++i) { + for (int j = 0; j < 13; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l)); + } + 
} + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_n_by_one() +{ + Tensor<float, 4, DataLayout> tensor(7,3,5,1); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 7); + VERIFY_IS_EQUAL(broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l)); + } + } + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_one_by_n_by_one_1d() +{ + Tensor<float, 3, DataLayout> tensor(1,7,1); + tensor.setRandom(); + array<ptrdiff_t, 3> broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 13; + Tensor<float, 3, DataLayout> broadcasted; + broadcasted = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcasted.dimension(0), 5); + VERIFY_IS_EQUAL(broadcasted.dimension(1), 7); + VERIFY_IS_EQUAL(broadcasted.dimension(2), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k)); + } + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_one_by_n_by_one_2d() +{ + Tensor<float, 4, DataLayout> tensor(1,7,13,1); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 5); + VERIFY_IS_EQUAL(broadcast.dimension(1), 7); + VERIFY_IS_EQUAL(broadcast.dimension(2), 13); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l)); + } + } + } + } +} -void test_cxx11_tensor_broadcasting() +EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting) { CALL_SUBTEST(test_simple_broadcasting<ColMajor>()); CALL_SUBTEST(test_simple_broadcasting<RowMajor>()); @@ -191,4 +320,12 @@ void test_cxx11_tensor_broadcasting() CALL_SUBTEST(test_static_broadcasting<RowMajor>()); CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>()); CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp new file mode 100644 index 000000000..72cb62fd5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -0,0 +1,354 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +// Functions used to compare the TensorMap implementation on the device with +// the equivalent on the host +namespace cl { +namespace sycl { +template <typename T> T abs(T x) { return cl::sycl::fabs(x); } +template <typename T> T square(T x) { return x * x; } +template <typename T> T cube(T x) { return x * x * x; } +template <typename T> T inverse(T x) { return T(1) / x; } +template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } +template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } +} +} + +struct EqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } +}; + +struct PlusEqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } +}; + +template <typename DataType, int DataLayout, + typename Assignement, typename Operator> +void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + Operator op; + Assignement asgn; + { + /* Assignement(out, Operator(in)) */ + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(in(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); + } + { + /* Assignement(out, Operator(out)) */ + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + 
asgn(device_expr, op(gpu_out)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(reference(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data_out); + } +} + +#define DECLARE_UNARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template <typename T> \ + auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ + return cl::sycl::FUNC(x); \ + } \ + template <typename T> \ + auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \ + return x.FUNC(); \ + } \ + }; + +DECLARE_UNARY_STRUCT(abs) +DECLARE_UNARY_STRUCT(sqrt) +DECLARE_UNARY_STRUCT(rsqrt) +DECLARE_UNARY_STRUCT(square) +DECLARE_UNARY_STRUCT(cube) +DECLARE_UNARY_STRUCT(inverse) +DECLARE_UNARY_STRUCT(tanh) +DECLARE_UNARY_STRUCT(exp) +DECLARE_UNARY_STRUCT(expm1) +DECLARE_UNARY_STRUCT(log) +DECLARE_UNARY_STRUCT(ceil) +DECLARE_UNARY_STRUCT(floor) +DECLARE_UNARY_STRUCT(round) +DECLARE_UNARY_STRUCT(log1p) +DECLARE_UNARY_STRUCT(sign) +DECLARE_UNARY_STRUCT(isnan) +DECLARE_UNARY_STRUCT(isfinite) +DECLARE_UNARY_STRUCT(isinf) + +template <typename DataType, int DataLayout, typename Assignement> +void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { +#define RUN_UNARY_TEST(FUNC) \ + test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \ + op_##FUNC>(sycl_device, tensor_range) + RUN_UNARY_TEST(abs); + RUN_UNARY_TEST(sqrt); + RUN_UNARY_TEST(rsqrt); + RUN_UNARY_TEST(square); + RUN_UNARY_TEST(cube); + RUN_UNARY_TEST(inverse); + RUN_UNARY_TEST(tanh); + RUN_UNARY_TEST(exp); + RUN_UNARY_TEST(expm1); + RUN_UNARY_TEST(log); + RUN_UNARY_TEST(ceil); + RUN_UNARY_TEST(floor); + RUN_UNARY_TEST(round); + RUN_UNARY_TEST(log1p); + RUN_UNARY_TEST(sign); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<bool, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + bool *gpu_data_out = + static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(bool)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_EQUAL(out(i), op(in(i))); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, int DataLayout> +void test_unary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_unary_builtins_for_assignement<DataType, DataLayout, + PlusEqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_for_assignement<DataType, DataLayout, + EqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isnan>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isfinite>(sycl_device, 
tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isinf>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + + test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, in_2) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random() + DataType(0.01); + in_2 = in_2.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_2 = static_cast<DataType *>( + sycl_device.allocate(in_2.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), + (in_2.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, gpu_2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_2); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, 2) */ + Operator op; + const DataType arg2(2); + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random(); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, arg2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_out); +} + +#define DECLARE_BINARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template 
<typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ + return cl::sycl::FUNC(x, y); \ + } \ + template <typename T1, typename T2> \ + auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \ + return x.FUNC(y); \ + } \ + }; + +DECLARE_BINARY_STRUCT(cwiseMax) +DECLARE_BINARY_STRUCT(cwiseMin) + +#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ + struct op_##NAME { \ + template <typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ + return x OPERATOR y; \ + } \ + }; + +DECLARE_BINARY_STRUCT_OP(plus, +) +DECLARE_BINARY_STRUCT_OP(minus, -) +DECLARE_BINARY_STRUCT_OP(times, *) +DECLARE_BINARY_STRUCT_OP(divide, /) +DECLARE_BINARY_STRUCT_OP(modulo, %) + +template <typename DataType, int DataLayout> +void test_binary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMax>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMin>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_plus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_minus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_times>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_divide>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins_fixed_arg2<DataType, RowMajor, + op_modulo>(sycl_device, tensor_range); + test_binary_builtins_fixed_arg2<DataType, ColMajor, + op_modulo>(sycl_device, tensor_range); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + QueueInterface queueInterface(device); + Eigen::SyclDevice sycl_device(&queueInterface); + CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device)); + CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device)); + CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu index 88c233994..97923d15f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu @@ -9,20 +9,17 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; 
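+ // GpuStreamDevice is the backend-neutral replacement for CudaStreamDevice,
+ // letting the same test drive either the CUDA or the HIP runtime.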
Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -75,8 +72,8 @@ void test_fallback_conversion() { } -void test_cxx11_tensor_cast_float16_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu) { - CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_gpu_conversion()); CALL_SUBTEST(test_fallback_conversion()); } diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 3c6d0d2ff..45456f3ef 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -8,6 +8,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "main.h" +#include "random_without_cast_overflow.h" #include <Eigen/CXX11/Tensor> @@ -104,12 +105,82 @@ static void test_small_to_big_type_cast() } } +template <typename FromType, typename ToType> +static void test_type_cast() { + Tensor<FromType, 2> ftensor(100, 200); + // Generate random values for a valid cast. + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 200; ++j) { + ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value(); + } + } + + Tensor<ToType, 2> ttensor(100, 200); + ttensor = ftensor.template cast<ToType>(); + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 200; ++j) { + const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j)); + VERIFY_IS_APPROX(ttensor(i, j), ref); + } + } +} + +template<typename Scalar, typename EnableIf = void> +struct test_cast_runner { + static void run() { + test_type_cast<Scalar, bool>(); + test_type_cast<Scalar, int8_t>(); + test_type_cast<Scalar, int16_t>(); + test_type_cast<Scalar, int32_t>(); + test_type_cast<Scalar, int64_t>(); + test_type_cast<Scalar, uint8_t>(); + test_type_cast<Scalar, uint16_t>(); + test_type_cast<Scalar, uint32_t>(); + test_type_cast<Scalar, uint64_t>(); + test_type_cast<Scalar, half>(); + test_type_cast<Scalar, bfloat16>(); + test_type_cast<Scalar, float>(); + test_type_cast<Scalar, double>(); + test_type_cast<Scalar, std::complex<float>>(); + test_type_cast<Scalar, std::complex<double>>(); + } +}; + +// Only certain types allow cast from std::complex<>. 
+template<typename Scalar> +struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> { + static void run() { + test_type_cast<Scalar, half>(); + test_type_cast<Scalar, bfloat16>(); + test_type_cast<Scalar, std::complex<float>>(); + test_type_cast<Scalar, std::complex<double>>(); + } +}; + -void test_cxx11_tensor_casts() +EIGEN_DECLARE_TEST(cxx11_tensor_casts) { - CALL_SUBTEST(test_simple_cast()); - CALL_SUBTEST(test_vectorized_cast()); - CALL_SUBTEST(test_float_to_int_cast()); - CALL_SUBTEST(test_big_to_small_type_cast()); - CALL_SUBTEST(test_small_to_big_type_cast()); + CALL_SUBTEST(test_simple_cast()); + CALL_SUBTEST(test_vectorized_cast()); + CALL_SUBTEST(test_float_to_int_cast()); + CALL_SUBTEST(test_big_to_small_type_cast()); + CALL_SUBTEST(test_small_to_big_type_cast()); + + CALL_SUBTEST(test_cast_runner<bool>::run()); + CALL_SUBTEST(test_cast_runner<int8_t>::run()); + CALL_SUBTEST(test_cast_runner<int16_t>::run()); + CALL_SUBTEST(test_cast_runner<int32_t>::run()); + CALL_SUBTEST(test_cast_runner<int64_t>::run()); + CALL_SUBTEST(test_cast_runner<uint8_t>::run()); + CALL_SUBTEST(test_cast_runner<uint16_t>::run()); + CALL_SUBTEST(test_cast_runner<uint32_t>::run()); + CALL_SUBTEST(test_cast_runner<uint64_t>::run()); + CALL_SUBTEST(test_cast_runner<half>::run()); + CALL_SUBTEST(test_cast_runner<bfloat16>::run()); + CALL_SUBTEST(test_cast_runner<float>::run()); + CALL_SUBTEST(test_cast_runner<double>::run()); + CALL_SUBTEST(test_cast_runner<std::complex<float>>::run()); + CALL_SUBTEST(test_cast_runner<std::complex<double>>::run()); + } diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 1832dec8b..922274462 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -43,7 +43,7 @@ static void test_simple_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -75,7 +75,7 @@ static void test_simple_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -126,7 +126,7 @@ static void test_dynamic_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -158,7 +158,7 @@ static void test_dynamic_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major() VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0)); } -void test_cxx11_tensor_chipping() +EIGEN_DECLARE_TEST(cxx11_tensor_chipping) { CALL_SUBTEST(test_simple_chip<ColMajor>()); CALL_SUBTEST(test_simple_chip<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp new file mode 100644 index 000000000..1e7093104 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ 
-0,0 +1,623 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + 
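// chip<1>(1) pins the second dimension at index 1, so every element of the
+ // 4-D chip maps back to tensor(i, 1l, j, k, l) in the original 5-D tensor.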
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType 
sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; 
++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) { + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange); + tensor.setRandom(); + tensor1.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + 
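// TensorMap wraps each raw device allocation as a tensor expression without
+ // copying, so the chip-plus-add expressions below run directly on the device.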
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange); + + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1; + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim3; ++j) { + for (int k = 0; k < sizeDim4; ++k) { + for (int l = 0; l < sizeDim5; ++l) { + float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l); + VERIFY_IS_EQUAL(chip1(i,j,k,l), expected); + } + } + } + } + + array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange); + Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange); + tensor2.setRandom(); + const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType); + DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize); + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2; + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim4; ++j) { + for (int k = 0; k < sizeDim5; ++k) { + float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k); + VERIFY_IS_EQUAL(chip2(i,j,k), expected); + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_tensor1); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_tensor2); + sycl_device.deallocate(gpu_data_chip2); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange); + input1.setRandom(); + input2.setRandom(); + + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl; + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize)); + + 
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize); + gpu_tensor.device(sycl_device)=gpu_input1; + sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize); + gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange); + input3.setRandom(); + + const size_t input3TensorBuffSize =input3.size()*sizeof(DataType); + DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize); + gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange); + input4.setRandom(); + + const size_t input4TensorBuffSize =input4.size()*sizeof(DataType); + DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize); + gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange); + input5.setRandom(); + + const size_t input5TensorBuffSize =input5.size()*sizeof(DataType); + DataType* gpu_data_input5 = 
static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize); + gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange); + input6.setRandom(); + + const size_t input6TensorBuffSize =input6.size()*sizeof(DataType); + DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize); + gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } + + + gpu_tensor.device(sycl_device)=gpu_input1; + Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange); + input7.setRandom(); + + DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize); + gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l); + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_input1); + sycl_device.deallocate(gpu_data_input2); + sycl_device.deallocate(gpu_data_input3); + sycl_device.deallocate(gpu_data_input4); + sycl_device.deallocate(gpu_data_input5); + sycl_device.deallocate(gpu_data_input6); + sycl_device.deallocate(gpu_data_input7); + +} + +template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + 
test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/ + test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device); + // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_chipping_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp index b1ff8aecb..1a18e07cc 100644 --- a/unsupported/test/cxx11_tensor_comparisons.cpp +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -77,7 +77,7 @@ static void test_equality() } -void test_cxx11_tensor_comparisons() +EIGEN_DECLARE_TEST(cxx11_tensor_comparisons) { CALL_SUBTEST(test_orderings()); CALL_SUBTEST(test_equality()); diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index 2baf5eaad..99447b21d 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -8,12 +8,9 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -31,7 +28,7 @@ void test_cuda_complex_cwise_ops() { cudaMalloc((void**)(&d_in2), complex_bytes); cudaMalloc((void**)(&d_out), complex_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -51,11 +48,13 @@ void test_cuda_complex_cwise_ops() { Add = 0, Sub, Mul, - Div + Div, + Neg, + NbOps }; Tensor<std::complex<T>, 1, 0, int> actual(kNumItems); - for (int op = Add; op <= Div; op++) { + for (int op = Add; op < NbOps; op++) { std::complex<T> expected; switch (static_cast<CwiseOp>(op)) { case Add: @@ -74,6 +73,12 @@ void test_cuda_complex_cwise_ops() { gpu_out.device(gpu_device) = gpu_in1 / gpu_in2; expected = a / b; break; + case Neg: + gpu_out.device(gpu_device) = -gpu_in1; + expected = -a; + break; + case NbOps: + break; } assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -90,7 +95,7 @@ void test_cuda_complex_cwise_ops() { } -void test_cxx11_tensor_complex_cwise_ops() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops) { CALL_SUBTEST(test_cuda_complex_cwise_ops<float>()); CALL_SUBTEST(test_cuda_complex_cwise_ops<double>()); diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index d4e111f5d..f8b8ae704 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -8,12 +8,9 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_complex + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -37,7 +34,7 @@ void test_cuda_nullary() { cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -73,7 +70,7 @@ void test_cuda_nullary() { static void test_cuda_sum_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -107,10 +104,45 @@ static void test_cuda_sum_reductions() { gpu_device.deallocate(gpu_out_ptr); } +static void test_cuda_mean_reductions() { + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random<int>(1024, 5*1024); + const int num_cols = internal::random<int>(1024, 5*1024); + + Tensor<std::complex<float>, 2> in(num_rows, num_cols); + in.setRandom(); + + Tensor<std::complex<float>, 0> full_redux; + full_redux = in.mean(); + + std::size_t in_bytes = in.size() * sizeof(std::complex<float>); + std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); + std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); + std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.mean(); + + Tensor<std::complex<float>, 0> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} static void test_cuda_product_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -145,9 +177,10 @@ static void test_cuda_product_reductions() { } -void test_cxx11_tensor_complex() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); + CALL_SUBTEST(test_cuda_mean_reductions()); CALL_SUBTEST(test_cuda_product_reductions()); } diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp index 03ef12e63..bb9418d33 100644 --- a/unsupported/test/cxx11_tensor_concatenation.cpp +++ b/unsupported/test/cxx11_tensor_concatenation.cpp @@ -50,7 +50,13 @@ static void test_static_dimension_failure() .reshape(Tensor<int, 3>::Dimensions(2, 3, 1)) .concatenate(right, 0); Tensor<int, 2, DataLayout> alternative = left - .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0); + // Clang compiler break with {{{}}} with an ambiguous error on copy constructor + // the variadic DSize constructor added for #ifndef EIGEN_EMULATE_CXX11_META_H. 
+ // Solution: + // either the code should change to + // Tensor<int, 2>::Dimensions{{2, 3}} + // or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}} + .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0); } template<int DataLayout> @@ -123,7 +129,7 @@ static void test_concatenation_as_lvalue() } -void test_cxx11_tensor_concatenation() +EIGEN_DECLARE_TEST(cxx11_tensor_concatenation) { CALL_SUBTEST(test_dimension_failures<ColMajor>()); CALL_SUBTEST(test_dimension_failures<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp new file mode 100644 index 000000000..765991b35 --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -0,0 +1,180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) +{ + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + IndexType leftDim3 = 1; + Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + IndexType rightDim3 = 1; + Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //IndexType concatDim1 = 3; +// IndexType concatDim2 = 3; +// IndexType concatDim3 = 1; + //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 3, DataLayout, IndexType> right(rightRange); + left.setRandom(); + right.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + /// + Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + + //concatenation = left.concatenate(right, 0); + gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); + sycl_device.memcpyDeviceToHost(concatenation1.data(), 
gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); + for (IndexType j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); + } + for (IndexType i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); + } + } + + sycl_device.deallocate(gpu_out_data1); + Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); + sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); + } + for (IndexType j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); + } + } + sycl_device.deallocate(gpu_out_data2); + Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); + sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); + } + } + sycl_device.deallocate(gpu_out_data3); + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); +} +template<typename DataType, int DataLayout, typename IndexType> +static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}}; + + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}}; + + IndexType concatDim1 = 4; + IndexType concatDim2 = 3; + Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}}; + + Tensor<DataType, 2, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 2, DataLayout, IndexType> right(rightRange); + Tensor<DataType, 2, DataLayout, IndexType> result(resRange); + + left.setRandom(); + right.setRandom(); + result.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + 
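// The concatenation view owns no storage of its own; separate device buffers
+ // back the two inputs and the 4x3 result used in the lvalue test below.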
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange); + + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + +// t1.concatenate(t2, 0) = result; + gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out; + sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(left(i, j), result(i, j)); + VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); + } + } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + + +template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device); + test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device); + test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConcat_perDevice<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp index ad9c9da39..9d806ee3c 100644 --- a/unsupported/test/cxx11_tensor_const.cpp +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -55,7 +55,7 @@ static void test_assign_of_const_tensor() } -void test_cxx11_tensor_const() +EIGEN_DECLARE_TEST(cxx11_tensor_const) { CALL_SUBTEST(test_simple_assign()); CALL_SUBTEST(test_assign_of_const_tensor()); diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index dd68430ce..575bdc1f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -10,21 +10,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_gpu_contraction(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these 
dimensions, the output has 300 * 140 elements, which is @@ -47,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -68,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -81,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } @@ -111,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -131,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); if (fabs(t_result() - t_result_gpu()) > 1e-4f && !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -139,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } template<int DataLayout> -void test_cuda_contraction_m() { +void test_gpu_contraction_m() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(k, 
128, 128); - test_cuda_contraction<RowMajor>(k, 128, 128); + test_gpu_contraction<ColMajor>(k, 128, 128); + test_gpu_contraction<RowMajor>(k, 128, 128); } } template<int DataLayout> -void test_cuda_contraction_k() { +void test_gpu_contraction_k() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, k, 128); - test_cuda_contraction<RowMajor>(128, k, 128); + test_gpu_contraction<ColMajor>(128, k, 128); + test_gpu_contraction<RowMajor>(128, k, 128); } } template<int DataLayout> -void test_cuda_contraction_n() { +void test_gpu_contraction_n() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, 128, k); - test_cuda_contraction<RowMajor>(128, 128, k); + test_gpu_contraction<ColMajor>(128, 128, k); + test_gpu_contraction<RowMajor>(128, 128, k); } } template<int DataLayout> -void test_cuda_contraction_sizes() { +void test_gpu_contraction_sizes() { int m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; @@ -188,29 +187,32 @@ void test_cuda_contraction_sizes() { for (int i = 0; i < 15; i++) { for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); + test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } } } } -void test_cxx11_tensor_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu) { - CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); - CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>()); - CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); - CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>()); + CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>()); - CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); - CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>()); - CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); - CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these subtests for HIP + CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>()); +#endif } diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp new file mode 100644 index 000000000..fbcc29358 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -0,0 +1,1026 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
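Editorial note: the new cxx11_tensor_contract_sycl.cpp below repeats one verification pattern throughout: allocate device memory, wrap it in a TensorMap, run the contraction on the device, copy the result back, and compare element-wise against the same contraction evaluated on the host. A minimal sketch of that round trip, assuming an already constructed Eigen::SyclDevice named sycl_device; the names are illustrative and not part of the patch: typedef Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> T2; typedef T2::DimensionPair DimPair; T2 lhs(64, 32), rhs(32, 48), ref(64, 48), out(64, 48); lhs.setRandom(); rhs.setRandom(); // Device-side buffers sized to the host tensors. float *d_lhs = static_cast<float *>(sycl_device.allocate(lhs.size() * sizeof(float))); float *d_rhs = static_cast<float *>(sycl_device.allocate(rhs.size() * sizeof(float))); float *d_out = static_cast<float *>(sycl_device.allocate(out.size() * sizeof(float))); Eigen::TensorMap<T2> gpu_lhs(d_lhs, 64, 32), gpu_rhs(d_rhs, 32, 48), gpu_out(d_out, 64, 48); sycl_device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float)); sycl_device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float)); // Pair dim 1 of lhs with dim 0 of rhs (a plain matrix product) and run it on the device. Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; gpu_out.device(sycl_device) = gpu_lhs.contract(gpu_rhs, dims); sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float)); // Host reference; any element-wise mismatch fails the test. ref = lhs.contract(rhs, dims); for (int64_t i = 0; i < ref.size(); i++) VERIFY_IS_APPROX(out.data()[i], ref.data()[i]); sycl_device.deallocate(d_lhs); sycl_device.deallocate(d_rhs); sycl_device.deallocate(d_out);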
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <algorithm> +#include <chrono> +#include <ctime> +#include <iostream> + +#include "main.h" + +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, result_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_m(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, + 128); + } +} + 
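A reading aid for the pairings used above and below: DimPair(a, b) contracts dimension a of the left operand against dimension b of the right one, so DimPair(1, 0) on rank-2 tensors is an ordinary matrix product, and the lhs/rhs/both "transposed" variants later in this file only change which dimensions are paired. A host-only illustration of that equivalence (a sketch, not part of the patch): typedef Eigen::Tensor<float, 2>::DimensionPair DimPair; Eigen::Tensor<float, 2> a(2, 3), b(3, 4); a.setRandom(); b.setRandom(); // Sum over dim 1 of a and dim 0 of b: a classic (2x3)*(3x4) product. Eigen::array<DimPair, 1> pairs = {{DimPair(1, 0)}}; Eigen::Tensor<float, 2> c = a.contract(b, pairs); // shape 2 x 4 // The equivalent triple loop, useful for checking a pairing by hand. for (int i = 0; i < 2; ++i) { for (int j = 0; j < 4; ++j) { float acc = 0.0f; for (int k = 0; k < 3; ++k) acc += a(i, k) * b(k, j); VERIFY_IS_APPROX(c(i, j), acc); } }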
+template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_k(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, + 128); + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_n(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, + 128, k); + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_sizes(const Device &sycl_device) { + IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; + + IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; + + IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, + 255, 257, 511, 512, 513, 1023, 1024, 1025}; + + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>( + sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; + + t_left.setRandom(); + t_right.setRandom(); + + // Allocate buffers twice as big to check for invalid reads and writes + auto padded_left_size = 2 * t_left.size(); + auto padded_right_size = 2 * t_right.size(); + auto padded_result_size = 2 * t_result.size(); + + std::size_t t_left_bytes = padded_left_size * sizeof(DataType); + std::size_t t_right_bytes = padded_right_size * sizeof(DataType); + std::size_t t_result_bytes = padded_result_size * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + // The TensorMaps are still the same size as the Tensors + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, result_dims); + + // Write NaNs after the actual buffer so that invalid reads propagate NaNs + // everywhere in the result + DataType nan = std::numeric_limits<DataType>::quiet_NaN(); + auto host_left_data = new DataType[padded_left_size]; + std::copy_n(t_left.data(), t_left.size(), host_left_data); + std::fill_n(host_left_data + t_left.size(), t_left.size(), nan); + auto
host_right_data = new DataType[padded_right_size]; + std::copy_n(t_right.data(), t_right.size(), host_right_data); + std::fill_n(host_right_data + t_right.size(), t_right.size(), nan); + auto host_result_data = new DataType[padded_result_size]; + std::fill_n(host_result_data, padded_result_size, nan); + + sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - host_result_data[i]))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), host_result_data[i], + error_threshold)) { + continue; + } + if (std::isnan(host_result_data[i])) { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid read detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } else { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + // Make sure that the rest of the result is still nans + for (IndexType i = t_result.size(); i < padded_result_size; i++) { + if (std::isnan(host_result_data[i])) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid write detected at IndexType " << i << ": " + << host_result_data[i] << std::endl; + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); + + delete[] host_left_data; + delete[] host_right_data; + delete[] host_result_data; +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, + IndexType n_size) { + // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << + // ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 0, DataLayout, IndexType> t_result; + Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + 
static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>> + gpu_t_result(d_t_result); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result() - t_result_gpu()))) > error_threshold && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { + std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size + << " : mismatch detected: " << t_result() << " vs " + << t_result_gpu() << std::endl; + VERIFY_IS_APPROX(t_result_gpu(), t_result()); + } + + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_batch(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, IndexType m_batch, + IndexType start, IndexType limit) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + typedef Eigen::array<IndexType, 3> TensorDim; + typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType; + TensorDim left_dims = {{m_batch, k_size, m_size}}; + TensorDim right_dims = {{m_batch, n_size, k_size}}; + TensorDim res_dims = {{m_batch, m_size, n_size}}; + Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}}; + + TensorType t_left(left_dims); + TensorType t_right(right_dims); + TensorType t_result_gpu(res_dims); + TensorType t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + for (int i = start; i < limit; ++i) { + auto x = gpu_t_left.template chip<0>(i); + auto y = gpu_t_right.template chip<0>(i); + auto z = gpu_t_result.template chip<0>(i); + z.device(sycl_device) = x.contract(y, contract_pairs); + } + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + for (int i = start; i < limit; ++i) { + auto x = t_left.template chip<0>(i); + auto y = t_right.template chip<0>(i); + auto z = 
t_result.template chip<0>(i); + z = x.contract(y, contract_pairs); + } + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType j = 0; j < m_size; j++) { + for (IndexType i = 0; i < n_size; i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType m: " << j << " n: " << i + << " CPU : " << t_result(j, i) + << " vs SYCL:" << t_result_gpu(j, i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); + } + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void 
contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_both_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * 
sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <typename Dev> +void inline tensorOutofBound(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Test out of bound for Tensor-Tensor + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 1024); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, + 4096); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024, + 2048); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 1024); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024, + 784); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 10); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096, + 513); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024, + 783); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 784); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024, + 11); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor out of bound tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, 
DataType, IndexType>(sycl_device, 128, 128, + 128); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128, + 128); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_m(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_n(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_k(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_sizes(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template <typename Dev> +void inline vectorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + 
std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // VECTOR-VECTOR + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1, + 1025); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1, + 1025); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1, + 1024); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1, + 1024); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1, + 1023); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1, + 1023); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "contracted tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline vectorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Vector-Tensor + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025, + 1025); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025, + 1025); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024, + 1024); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024, + 1024); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023, + 1023); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023, + 1023); + + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097, + 4097); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097, + 4097); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096, + 4096); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096, + 4096); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095, + 4095); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095, + 4095); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816, + 32); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Matrix-Vector + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197, + 1); + 
test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095, + 1); +// If GEMV is disabled, a single kernel computes the whole contraction, so +// the accumulated floating-point rounding error exceeds the precision +// threshold for float and the test fails. With GEMV enabled, several kernels +// are created and the accumulation (together with its rounding error) is +// split up among them. +#ifndef EIGEN_SYCL_DISABLE_GEMV + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032, + 1); +#endif + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorScalar(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // SCALAR Contraction + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_row(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_col(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, DataType,
IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_batch_per_device(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_lhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4, + 8); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784, + 2048, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 10, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 4096, 1024); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_rhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4, + 16); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10, + 1024, 1024); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 1024, 4096); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 
2048); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 1024, 784); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_both_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, + 16, 64); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) { + for (const auto &device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(tensorOutofBound(sycl_device)); + CALL_SUBTEST_2(tensorTensor(sycl_device)); + CALL_SUBTEST_2(tensorTensor_m(sycl_device)); + CALL_SUBTEST_2(tensorTensor_n(sycl_device)); + CALL_SUBTEST_2(tensorTensor_k(sycl_device)); + CALL_SUBTEST_2(tensorTensor_sizes(sycl_device)); + CALL_SUBTEST_3(vectorVector(sycl_device)); + CALL_SUBTEST_4(vectorTensor(sycl_device)); + CALL_SUBTEST_5(tensorVector(sycl_device)); + CALL_SUBTEST_6(tensorScalar(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_row(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_col(sycl_device)); + CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device)); + CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index ace97057f..3b5c6a13c 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -471,7 +471,8 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}}); + Eigen::array<DimPair, 0> dims; + Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); @@ -510,36 +511,91 @@ static void test_const_inputs() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } -void test_cxx11_tensor_contraction() +// Apply Sqrt to all output elements. 
+struct SqrtOutputKernel { + template <typename Index, typename Scalar> + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template <int DataLayout> +static void test_large_contraction_with_output_kernel() { + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify that the contraction overwrites the output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +EIGEN_DECLARE_TEST(cxx11_tensor_contraction) { - CALL_SUBTEST(test_evals<ColMajor>()); - CALL_SUBTEST(test_evals<RowMajor>()); - CALL_SUBTEST(test_scalar<ColMajor>()); - CALL_SUBTEST(test_scalar<RowMajor>()); - CALL_SUBTEST(test_multidims<ColMajor>()); - CALL_SUBTEST(test_multidims<RowMajor>()); - CALL_SUBTEST(test_holes<ColMajor>()); - CALL_SUBTEST(test_holes<RowMajor>()); - CALL_SUBTEST(test_full_redux<ColMajor>()); - CALL_SUBTEST(test_full_redux<RowMajor>()); - CALL_SUBTEST(test_contraction_of_contraction<ColMajor>()); - CALL_SUBTEST(test_contraction_of_contraction<RowMajor>()); - CALL_SUBTEST(test_expr<ColMajor>()); - CALL_SUBTEST(test_expr<RowMajor>()); - CALL_SUBTEST(test_out_of_order_contraction<ColMajor>()); - CALL_SUBTEST(test_out_of_order_contraction<RowMajor>()); - CALL_SUBTEST(test_consistency<ColMajor>()); - CALL_SUBTEST(test_consistency<RowMajor>()); - CALL_SUBTEST(test_large_contraction<ColMajor>()); - CALL_SUBTEST(test_large_contraction<RowMajor>()); - CALL_SUBTEST(test_matrix_vector<ColMajor>()); - CALL_SUBTEST(test_matrix_vector<RowMajor>()); - CALL_SUBTEST(test_tensor_vector<ColMajor>()); - CALL_SUBTEST(test_tensor_vector<RowMajor>()); - CALL_SUBTEST(test_small_blocking_factors<ColMajor>()); - CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); - CALL_SUBTEST(test_tensor_product<ColMajor>()); - CALL_SUBTEST(test_tensor_product<RowMajor>()); - CALL_SUBTEST(test_const_inputs<ColMajor>()); - CALL_SUBTEST(test_const_inputs<RowMajor>()); + CALL_SUBTEST_1(test_evals<ColMajor>()); + CALL_SUBTEST_1(test_evals<RowMajor>()); + CALL_SUBTEST_1(test_scalar<ColMajor>()); + CALL_SUBTEST_1(test_scalar<RowMajor>()); + CALL_SUBTEST_2(test_multidims<ColMajor>()); + CALL_SUBTEST_2(test_multidims<RowMajor>()); + CALL_SUBTEST_2(test_holes<ColMajor>()); + CALL_SUBTEST_2(test_holes<RowMajor>()); +
CALL_SUBTEST_3(test_full_redux<ColMajor>()); + CALL_SUBTEST_3(test_full_redux<RowMajor>()); + CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>()); + CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>()); + CALL_SUBTEST_4(test_expr<ColMajor>()); + CALL_SUBTEST_4(test_expr<RowMajor>()); + CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>()); + CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>()); + CALL_SUBTEST_5(test_consistency<ColMajor>()); + CALL_SUBTEST_5(test_consistency<RowMajor>()); + CALL_SUBTEST_5(test_large_contraction<ColMajor>()); + CALL_SUBTEST_5(test_large_contraction<RowMajor>()); + CALL_SUBTEST_6(test_matrix_vector<ColMajor>()); + CALL_SUBTEST_6(test_matrix_vector<RowMajor>()); + CALL_SUBTEST_6(test_tensor_vector<ColMajor>()); + CALL_SUBTEST_6(test_tensor_vector<RowMajor>()); + CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>()); + CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>()); + CALL_SUBTEST_7(test_tensor_product<ColMajor>()); + CALL_SUBTEST_7(test_tensor_product<RowMajor>()); + CALL_SUBTEST_8(test_const_inputs<ColMajor>()); + CALL_SUBTEST_8(test_const_inputs<RowMajor>()); + CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>()); + CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>()); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8 + } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index e3d4675eb..c3688f678 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -25,7 +25,8 @@ static void test_evals() Tensor<float, 2, DataLayout> result(2,3); result.setZero(); - Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}}; + Eigen::array<Tensor<float, 2>::Index, 1> dims3; + dims3[0] = 0; typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator; Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); @@ -136,7 +137,7 @@ static void test_strides() { input(12)*kernel(2))); } -void test_cxx11_tensor_convolution() +EIGEN_DECLARE_TEST(cxx11_tensor_convolution) { CALL_SUBTEST(test_evals<ColMajor>()); CALL_SUBTEST(test_evals<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp new file mode 100644 index 000000000..3954c8a28 --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
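Editorial note: the convolution tests below all verify the same identity: convolving a kernel of extent K along a dimension of extent N yields extent N - K + 1 (numpy's "valid" mode), with out(i) equal to the sum over k of in(i + k) * kernel(k). A host-only sketch of the 1-D case that the SYCL variants check against (illustrative only, not part of the patch): Eigen::Tensor<float, 1> in(5), ker(2); in.setRandom(); ker.setRandom(); // Convolve along dimension 0; the output extent is 5 - 2 + 1 = 4. Eigen::array<Eigen::Tensor<float, 1>::Index, 1> conv_dim; conv_dim[0] = 0; Eigen::Tensor<float, 1> out = in.convolve(ker, conv_dim); for (int i = 0; i < 4; ++i) VERIFY_IS_APPROX(out(i), in(i) * ker(0) + in(i + 1) * ker(1));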
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> +#include <iomanip> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +static const float error_threshold =1e-4f; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=55; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 1> kernel_dims = {{4}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 2> kernel_dims = {{4,5}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims); + 
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 2> dims3{{0,1}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=49; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 3> dims3{{0,1,2}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + 
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_evals(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 1> kernel_dims = {{2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 3}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout,IndexType> result(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 2> kernel_dims = {{2, 2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 2}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout, 
IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout, IndexType> result(result_dims); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 2> dims; + dims[0] = 0; + dims[1] = 1; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_modes(const Eigen::SyclDevice& sycl_device){ + +Eigen::array<IndexType, 1> input_dims = {{3}}; +Eigen::array<IndexType, 1> kernel_dims = {{3}}; + +Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); +Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); + +input.setRandom(); +kernel.setRandom(); +Eigen::array<IndexType, 1> dims; +dims[0] = 0; + + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + Eigen::array<std::pair<IndexType, IndexType>, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
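// VALID keeps only the fully-overlapping positions: with input length 3 and
// kernel length 3 there is 3 - 3 + 1 = 1 output, here
// 1*0.5 + 2*1.0 + 3*0.0 = 2.5, as verified below.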
+ padding[0] = std::make_pair(0, 0); + Tensor<DataType, 1, DataLayout, IndexType> valid(1); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t valid_bytes = valid.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); + + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor<DataType, 1, DataLayout, IndexType> same(3); + std::size_t same_bytes = same.size() * sizeof(DataType); + DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions()); + gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); + + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
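// FULL pads by kernel_size - 1 = 2 on each side, giving 3 + 3 - 1 = 5
// outputs: sliding {0.5, 1.0, 0.0} over the padded {0, 0, 1, 2, 3, 0, 0}
// yields {0.0, 1.0, 2.5, 4.0, 1.5}, matching the checks below.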
+ padding[0] = std::make_pair(2, 2); + + Tensor<DataType, 1, DataLayout, IndexType> full(5); + std::size_t full_bytes = full.size() * sizeof(DataType); + DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions()); + gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); + + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_valid); + sycl_device.deallocate(d_same); + sycl_device.deallocate(d_full); + +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strides(const Eigen::SyclDevice& sycl_device){ + + Eigen::array<IndexType, 1> input_dims = {{13}}; + Eigen::array<IndexType, 1> kernel_dims = {{3}}; + + Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); + Tensor<DataType, 1, DataLayout, IndexType> result(2); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 1> dims; + dims[0] = 0; + + Eigen::array<IndexType, 1> stride_of_3; + stride_of_3[0] = 3; + Eigen::array<IndexType, 1> stride_of_2; + stride_of_2[0] = 2; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + +template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_larg_expr1D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr1D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr2D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr2D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr3D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr3D<float, ColMajor, int64_t>(sycl_device); + test_evals<float, ColMajor, int64_t>(sycl_device); + test_evals<float, RowMajor, int64_t>(sycl_device); + test_expr<float, ColMajor, 
int64_t>(sycl_device); + test_expr<float, RowMajor, int64_t>(sycl_device); + test_modes<float, ColMajor, int64_t>(sycl_device); + test_modes<float, RowMajor, int64_t>(sycl_device); + test_strides<float, ColMajor, int64_t>(sycl_device); + test_strides<float, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConvolutionPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp index 4528cc176..b5dbc97bd 100644 --- a/unsupported/test/cxx11_tensor_custom_index.cpp +++ b/unsupported/test/cxx11_tensor_custom_index.cpp @@ -88,7 +88,7 @@ static void test_sizes_as_index() } -void test_cxx11_tensor_custom_index() { +EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) { test_map_as_index<ColMajor>(); test_map_as_index<RowMajor>(); test_matrix_as_index<ColMajor>(); diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 8baa477cc..875ea57d2 100644 --- a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -104,7 +104,7 @@ static void test_custom_binary_op() } -void test_cxx11_tensor_custom_op() +EIGEN_DECLARE_TEST(cxx11_tensor_custom_op) { CALL_SUBTEST(test_custom_unary_op()); CALL_SUBTEST(test_custom_binary_op()); diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp new file mode 100644 index 000000000..d947ead83 --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -0,0 +1,170 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
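The SYCL test files in this patch (cxx11_tensor_convolution_sycl.cpp above, cxx11_tensor_custom_op_sycl.cpp below) all share the same scaffolding: allocate device buffers, copy inputs host-to-device, evaluate a tensor expression through .device(sycl_device), copy the result back, verify, and deallocate. A minimal standalone sketch of that pattern, using only device calls that appear in the patch (the function name and sizes are illustrative, not part of the patch):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Square a 1-D tensor on a SYCL device, mirroring the structure of the tests.
static void square_on_device(const Eigen::SyclDevice& sycl_device) {
  Eigen::Tensor<float, 1> host_in(100), host_out(100);
  host_in.setRandom();
  const std::size_t bytes = host_in.size() * sizeof(float);
  float* d_in  = static_cast<float*>(sycl_device.allocate(bytes));
  float* d_out = static_cast<float*>(sycl_device.allocate(bytes));
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 100);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 100);
  sycl_device.memcpyHostToDevice(d_in, host_in.data(), bytes);
  gpu_out.device(sycl_device) = gpu_in * gpu_in;  // evaluated on the device
  sycl_device.memcpyDeviceToHost(host_out.data(), d_out, bytes);
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}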
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +template<typename TensorType> +struct InsertZeros { + DSizes<DenseIndex, 2> dimensions(const TensorType& input) const { + DSizes<DenseIndex, 2> result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input, Output& output, const Device& device) const + { + array<DenseIndex, 2> strides; + strides[0] = 2; + strides[1] = 2; + output.stride(strides).device(device) = input; + + Eigen::DSizes<DenseIndex, 2> offsets(1,1); + Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 5; + Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}}; + + Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(out.dimension(0), 6); + VERIFY_IS_EQUAL(out.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), 0); + } + } + sycl_device.deallocate(gpu_in1_data); +sycl_device.deallocate(gpu_out_data); +} + +template<typename TensorType> +struct BatchMatMul { + DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const { + DSizes<DenseIndex, 3> result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input1, const TensorType& input2, + Output& output, const Device& device) const + { + typedef typename TensorType::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + for (int64_t i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims); + } + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + + Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}}; + 
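// BatchMatMul contracts dim 1 of the first operand with dim 0 of the second,
// one chip at a time: for each of the 5 slices, (2x3) * (3x7) = (2x7), so
// the expected output shape below is {2, 7, 5}.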
Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}}; + Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}}; + + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange1); + TensorType gpu_in2(gpu_in2_data, tensorRange2); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + in2.setRandom(); + + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); + + gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 5; ++i) { + typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims); + TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i); + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(custom_op_perDevice<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index fde20ddf2..c9f78d2d3 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -9,16 +9,15 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; using Eigen::RowMajor; @@ -68,22 +67,22 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { GPUContext(const 
Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess); float kernel_1d_val[] = {3.14f, 2.7f}; - assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess); float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; - assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess); float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; - assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); } ~GPUContext() { - assert(cudaFree(kernel_1d_) == cudaSuccess); - assert(cudaFree(kernel_2d_) == cudaSuccess); - assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(gpuFree(kernel_1d_) == gpuSuccess); + assert(gpuFree(kernel_2d_) == gpuSuccess); + assert(gpuFree(kernel_3d_) == gpuSuccess); } const Eigen::GpuDevice& device() const { return gpu_device_; } @@ -104,7 +103,7 @@ struct GPUContext { float* kernel_2d_; float* kernel_3d_; - Eigen::CudaStreamDevice stream_; + Eigen::GpuStreamDevice stream_; Eigen::GpuDevice gpu_device_; }; @@ -283,12 +282,12 @@ void test_gpu() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); @@ -296,7 +295,7 @@ void test_gpu() { GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -306,7 +305,7 @@ void test_gpu() { } test_forced_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -316,7 +315,7 @@ void test_gpu() { } test_compound_assignment(&context); - 
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -326,7 +325,7 @@ void test_gpu() { } test_contraction(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { const float result = out(i,j,0); @@ -341,8 +340,8 @@ void test_gpu() { } test_1d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { @@ -352,8 +351,8 @@ void test_gpu() { } test_2d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -365,9 +364,13 @@ void test_gpu() { } } +#if !defined(EIGEN_USE_HIP) +// disable this test on the HIP platform +// 3D tensor convolutions seem to hang on the HIP platform + test_3d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -380,10 +383,13 @@ void test_gpu() { } } } + +#endif + } -void test_cxx11_tensor_device() +EIGEN_DECLARE_TEST(cxx11_tensor_device) { CALL_SUBTEST_1(test_cpu()); CALL_SUBTEST_2(test_gpu()); diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 7f79753c5..5095cb078 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -13,19 +13,65 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <stdint.h> +#include <iostream> + +template <typename DataType, int DataLayout, typename IndexType> +void test_device_memory(const Eigen::SyclDevice &sycl_device) { + std::cout << "Running on : " + << sycl_device.sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::name>()
+            <<std::endl;
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
+  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+  sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+  VERIFY(sycl_device.ok());
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorDims = {{sizeDim1}};
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+  sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
-void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
-  std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
-  << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+  out.device(sycl_device) = in / in.constant(0);
+
+  sycl_device.synchronize();
+  VERIFY(!sycl_device.ok());
+  sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+  test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// this test throws an exception. Enable it if you want to see the exception
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  /// this test throws an exception. Enable it if you want to see the exception
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
}
-void test_cxx11_tensor_device_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_device_sycl(sycl_device));
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  }
}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..ee416e14a 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,10 +60,29 @@ static void test_rank_zero()
 VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
}
-void test_cxx11_tensor_dimension()
+static void test_index_type_promotion()
{
+  Eigen::DSizes<int, 3> src0(1, 2, 3);
+  Eigen::array<int, 3> src1;
+  src1[0] = 4;
+  src1[1] = 5;
+  src1[2] = 6;
+
+  Eigen::DSizes<long, 3> dst0(src0);
+  Eigen::DSizes<long, 3> dst1(src1);
+
+  VERIFY_IS_EQUAL(dst0[0], 1L);
+  VERIFY_IS_EQUAL(dst0[1], 2L);
+  VERIFY_IS_EQUAL(dst0[2], 3L);
+  VERIFY_IS_EQUAL(dst1[0], 4L);
+  VERIFY_IS_EQUAL(dst1[1], 5L);
+  VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
{
 CALL_SUBTEST(test_dynamic_size());
 CALL_SUBTEST(test_fixed_size());
 CALL_SUBTEST(test_match());
 CALL_SUBTEST(test_rank_zero());
+  CALL_SUBTEST(test_index_type_promotion());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
 }
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
{
 CALL_SUBTEST(test_empty_tensor());
 CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..66b06e8ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yield the
+// same results for all the ops that support tiled evaluation.
+
+// Default assignment that does not use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device, + typename Dst, typename Expr> +static void DeviceAssign(Device& d, Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp<Dst, const Expr>; + using Executor = Eigen::internal::TensorExecutor<const Assign, Device, + Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); +} + +template <int NumDims> +static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) { + array<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<int>(min_dim, max_dim); + } + return dims; +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> src(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> lhs(dims); + Tensor<T, NumDims, Options, Index> rhs(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_broadcasting(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims<NumDims>(1, 7); + const auto expr = src.broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. 
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_chipping_rvalue(Device d) +{ + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Layout, Index> src(dims); + src.setRandom(); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ + const auto expr = src.template chip<(CHIP_DIM)>(offset); \ + \ + Tensor<T, NumDims - 1, Layout, Index> golden; \ + golden = expr; \ + \ + Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \ + \ + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \ + using Executor = internal::TensorExecutor<const Assign, Device, \ + Vectorizable, Tiling>; \ + \ + Executor::run(Assign(dst, expr), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_chipping_lvalue(Device d) +{ + auto dims = RandomDims<NumDims>(1, 10); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + /* Generate random data that we'll assign to the chipped tensor dim. */ \ + array<Index, NumDims - 1> src_dims; \ + for (int i = 0; i < NumDims - 1; ++i) { \ + int dim = i < (CHIP_DIM) ? i : i + 1; \ + src_dims[i] = dims[dim]; \ + } \ + \ + Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \ + src.setRandom(); \ + \ + const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ + \ + Tensor<T, NumDims, Layout, Index> random(dims); \ + random.setZero(); \ + \ + Tensor<T, NumDims, Layout, Index> golden(dims); \ + golden = random; \ + golden.template chip<(CHIP_DIM)>(offset) = src; \ + \ + Tensor<T, NumDims, Layout, Index> dst(dims); \ + dst = random; \ + auto expr = dst.template chip<(CHIP_DIM)>(offset); \ + \ + using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \ + using Executor = internal::TensorExecutor<const Assign, Device, \ + Vectorizable, Tiling>; \ + \ + Executor::run(Assign(expr, src), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_shuffle_rvalue(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + DSizes<Index, NumDims> shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + // Test all possible shuffle permutations. 
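// std::next_permutation in the loop below enumerates all NumDims! orders of
// the identity permutation initialized above: 6 for 3 dims, 24 for 4, 120 for 5.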
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Multiply the 0th and 1st dimensions together and then shuffle.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+ Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions()); + + auto expr = src.reshape(reshaped_dims); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_slice_rvalue(Device d) +{ + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(5, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>()); + auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>()); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor<T, NumDims, Options, Index> golden = + src.slice(slice_start, slice_size); + + // Now reshape using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + auto expr = src.slice(slice_start, slice_size); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_slice_lvalue(Device d) +{ + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(5, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); + auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor<T, NumDims, Options, Index> slice(slice_size); + slice.setRandom(); + + // Assign a slice using default executor. + Tensor<T, NumDims, Options, Index> golden = src; + golden.slice(slice_start, slice_size) = slice; + + // And using configured execution strategy. 
+ Tensor<T, NumDims, Options, Index> dst = src; + auto expr = dst.slice(slice_start, slice_size); + + using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(expr, slice), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_broadcasting_of_forced_eval(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims<NumDims>(1, 7); + const auto expr = src.square().eval().broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template<typename T, int NumDims> +struct DummyGenerator { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T operator()(const array <Index, NumDims>& dims) const { + T result = static_cast<T>(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast<T>((i + 1) * dims[i]); + } + return result; + } +}; + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_generator_op(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(20, 30); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto expr = src.generate(DummyGenerator<T, NumDims>()); + + // We assume that generator on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_reverse_rvalue(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims)); + Tensor <T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Reverse half of the dimensions. 
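// Each dimension is flipped independently with probability 1/2, so half of
// them are reversed in expectation.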
+ Eigen::array<bool, NumDims> reverse; + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); + + const auto expr = src.reverse(reverse); + + // We assume that reversing on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor <T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the reversing using configured tensor executor. + Tensor <T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_async_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> src(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + Eigen::Barrier done(1); + auto on_done = [&done]() { done.Notify(); }; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using DoneCallback = decltype(on_done); + using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, + Vectorizable, Tiling>; + + Executor::runAsync(Assign(dst, expr), d, on_done); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_async_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
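// With NumDims = 3 each dim is drawn from [16, 33], i.e. roughly 4K to 36K
// coefficients in total, well above the small-block cutoff.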
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> lhs(dims); + Tensor<T, NumDims, Options, Index> rhs(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + Eigen::Barrier done(1); + auto on_done = [&done]() { done.Notify(); }; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using DoneCallback = decltype(on_done); + using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, + Vectorizable, Tiling>; + + Executor::runAsync(Assign(dst, expr), d, on_done); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + +#ifdef EIGEN_DONT_VECTORIZE +#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL +#else +#define VECTORIZABLE(VAL) VAL +#endif + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) + +// NOTE: Currently only ThreadPoolDevice supports async expression evaluation. 
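// The async macro below therefore instantiates only the eight ThreadPoolDevice
// combinations ({scalar, vectorized} x {tiled, untiled} x {col, row} major),
// while CALL_SUBTEST_COMBINATIONS above also covers the eight DefaultDevice ones.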
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) + +EIGEN_DECLARE_TEST(cxx11_tensor_executor) { + Eigen::DefaultDevice default_device; + // Default device is unused in ASYNC tests. + EIGEN_UNUSED_VARIABLE(default_device); + + const auto num_threads = internal::random<int>(20, 24); + Eigen::ThreadPool tp(num_threads); + Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5); + + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5); + + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3); + 
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); + + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 77e24cb67..169fc1898 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -7,6 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include <numeric> + #include "main.h" #include <Eigen/CXX11/Tensor> @@ -193,26 +195,23 @@ static void test_constants() static void test_boolean() { - Tensor<int, 1> vec(6); - std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data()); + const int kSize = 31; + Tensor<int, 1> vec(kSize); + std::iota(vec.data(), vec.data() + kSize, 0); // Test ||. Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4); - VERIFY_IS_EQUAL(bool1[0], true); - VERIFY_IS_EQUAL(bool1[1], false); - VERIFY_IS_EQUAL(bool1[2], false); - VERIFY_IS_EQUAL(bool1[3], false); - VERIFY_IS_EQUAL(bool1[4], false); - VERIFY_IS_EQUAL(bool1[5], true); + for (int i = 0; i < kSize; ++i) { + bool expected = i < 1 || i > 4; + VERIFY_IS_EQUAL(bool1[i], expected); + } // Test &&, including cast of operand vec. 
Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4); - VERIFY_IS_EQUAL(bool2[0], false); - VERIFY_IS_EQUAL(bool2[1], true); - VERIFY_IS_EQUAL(bool2[2], true); - VERIFY_IS_EQUAL(bool2[3], true); - VERIFY_IS_EQUAL(bool2[4], false); - VERIFY_IS_EQUAL(bool2[5], false); + for (int i = 0; i < kSize; ++i) { + bool expected = bool(i) && i < 4; + VERIFY_IS_EQUAL(bool2[i], expected); + } // Compilation tests: // Test Tensor<bool> against results of cast or comparison; verifies that @@ -300,8 +299,152 @@ static void test_select() } } +template <typename Scalar> +void test_minmax_nan_propagation_templ() { + for (int size = 1; size < 17; ++size) { + const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN(); + const Scalar kInf = std::numeric_limits<Scalar>::infinity(); + const Scalar kZero(0); + Tensor<Scalar, 1> vec_all_nan(size); + Tensor<Scalar, 1> vec_one_nan(size); + Tensor<Scalar, 1> vec_zero(size); + vec_all_nan.setConstant(kNaN); + vec_zero.setZero(); + vec_one_nan.setZero(); + vec_one_nan(size/2) = kNaN; + + auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) { + for (int i = 0; i < size; ++i) { + VERIFY((numext::isnan)(v(i))); + } + }; + + auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) { + for (int i = 0; i < size; ++i) { + VERIFY_IS_EQUAL(v(i), Scalar(0)); + } + }; + + // Test NaN propagating max. + // max(nan, nan) = nan + // max(nan, 0) = nan + // max(0, nan) = nan + // max(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_all_nan)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kZero)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_zero)); + verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN)); + verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero)); + + // Test number propagating max. + // max(nan, nan) = nan + // max(nan, 0) = 0 + // max(0, nan) = 0 + // max(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(kZero)); + verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_zero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero)); + + // Test NaN propagating min. + // min(nan, nan) = nan + // min(nan, 0) = nan + // min(0, nan) = nan + // min(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_all_nan)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kZero)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_zero)); + verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN)); + verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero)); + + // Test number propagating min. 
+ // min(nan, nan) = nan + // min(nan, 0) = 0 + // min(0, nan) = 0 + // min(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(kZero)); + verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_zero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero)); + + // Test min and max reduction + Tensor<Scalar, 0> val; + val = vec_zero.minimum(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template minimum<PropagateNaN>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.maximum(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template maximum<PropagateNaN>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kZero); + + // Test NaN propagation for tensor of all NaNs. + val = vec_all_nan.template minimum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_all_nan.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kInf); + val = vec_all_nan.template maximum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_all_nan.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), -kInf); + + // Test NaN propagation for tensor with a single NaN. + val = vec_one_nan.template minimum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_one_nan.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero)); + val = vec_one_nan.template maximum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_one_nan.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), (size == 1 ? 
-kInf : kZero));
+  }
+}
+
+static void test_clip()
+{
+  Tensor<float, 1> vec(6);
+  vec(0) = 4.0;
+  vec(1) = 8.0;
+  vec(2) = 15.0;
+  vec(3) = 16.0;
+  vec(4) = 23.0;
+  vec(5) = 42.0;
+
+  float kMin = 20;
+  float kMax = 30;
+
+  Tensor<float, 1> vec_clipped(6);
+  vec_clipped = vec.clip(kMin, kMax);
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}

-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
@@ -311,4 +454,11 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_clip());
+
+// NaN propagation does not currently work as one would expect from
+// std::max/std::min, so we disable it for now.
+#if !EIGEN_ARCH_ARM_OR_ARM64
+  CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
 }
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 2f14ebc62..2e1008eca 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,7 +224,35 @@ static void test_fft_real_input_energy() {
   }
 }

-void test_cxx11_tensor_fft() {
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+  int n = (1 << exponent) + 1;
+
+  Eigen::DSizes<ptrdiff_t, 1> dimensions;
+  dimensions[0] = n;
+  const DSizes<ptrdiff_t, 1> arr = dimensions;
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+      input.template fft<BothParts, FFT_FORWARD>(fft);
+
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+      forward.template fft<RealPart, FFT_REVERSE>(fft);
+
+  for (int i = 0; i < n; ++i) {
+    RealScalar tol = test_precision<RealScalar>() *
+                     (std::abs(input[i]) + std::abs(output[i]) + 1);
+    VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
   test_fft_complex_input_golden();
   test_fft_real_input_golden();
@@ -270,4 +298,7 @@ void test_cxx11_tensor_fft() {
   test_fft_real_input_energy<RowMajor, double, true, Eigen::BothParts, FFT_FORWARD, 4>();
   test_fft_real_input_energy<RowMajor, float, false, Eigen::BothParts, FFT_FORWARD, 4>();
   test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>();
+
+  test_fft_non_power_of_2_round_trip<float>(7);
+  test_fft_non_power_of_2_round_trip<double>(7);
 }
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 4c660de65..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
   VERIFY_IS_EQUAL(scalar1.size(), 1);
-  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+  VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);

   scalar1() = 7.0;
   scalar2() = 13.0;
@@ -250,7 +250,7 @@ static void test_array()
   }
 }

-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..a21a02bec 100644 --- a/unsupported/test/cxx11_tensor_forced_eval.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -61,7 +61,7 @@ static void test_const() Eigen::array<int, 2> bcast; bcast[0] = 3; bcast[1] = 1; - const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3); + const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3); Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); for (int i = 0; i < 3; ++i) { @@ -72,7 +72,7 @@ static void test_const() } -void test_cxx11_tensor_forced_eval() +EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval) { CALL_SUBTEST(test_simple()); CALL_SUBTEST(test_const()); diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 5690da723..a55a5ad8a 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -13,44 +13,44 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; - +template <typename DataType, int DataLayout, typename IndexType> void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 200; - int sizeDim3 = 200; - Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor<float, 3> in1(tensorRange); - Eigen::Tensor<float, 3> in2(tensorRange); - Eigen::Tensor<float, 3> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 20; + IndexType sizeDim3 = 20; + Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); - float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); - float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); + in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f)); + in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f)); // creating TensorMap from tensor - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float)); + Eigen::TensorMap<Eigen::Tensor<DataType, 
3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -63,8 +63,15 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { } -void test_cxx11_tensor_forced_eval_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_forced_eval_sycl(sycl_device)); +template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorForced_evalperDevice<float>(device)); + CALL_SUBTEST(tensorForced_evalperDevice<half>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp index dcb928714..6dcf676bb 100644 --- a/unsupported/test/cxx11_tensor_generator.cpp +++ b/unsupported/test/cxx11_tensor_generator.cpp @@ -42,11 +42,11 @@ struct Generator2D { template <int DataLayout> static void test_2D() { - Tensor<float, 2> matrix(5, 7); + Tensor<float, 2> matrix(512, 512); Tensor<float, 2> result = matrix.generate(Generator2D()); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 512; ++i) { + for (int j = 0; j < 512; ++j) { VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); } } @@ -80,7 +80,7 @@ static void test_gaussian() } -void test_cxx11_tensor_generator() +EIGEN_DECLARE_TEST(cxx11_tensor_generator) { CALL_SUBTEST(test_1D<ColMajor>()); CALL_SUBTEST(test_1D<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp new file mode 100644 index 000000000..fb6e3d9d0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
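+//
+// The tests in this file exercise Tensor::generate(), which fills a tensor
+// by calling a user-supplied functor with the coordinates of every element.
+// A minimal host-side sketch of the pattern (Iota2D is an illustrative name,
+// not a type defined here):
+//
+//   struct Iota2D {
+//     float operator()(const Eigen::array<Eigen::DenseIndex, 2>& c) const {
+//       return static_cast<float>(10 * c[0] + c[1]);
+//     }
+//   };
+//   Eigen::Tensor<float, 2> t(3, 4);
+//   Eigen::Tensor<float, 2> r = t.generate(Iota2D());  // r(i, j) == 10*i + j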
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold = 1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 6;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout, IndexType> vec(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType rows = 32;
+  IndexType cols = 48;
+  array<DataType, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<DataType, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  array<IndexType, 2> tensorRange = {{rows, cols}};
+  Tensor<DataType, 2, DataLayout, IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      // Verify the device result against the analytic Gaussian; the check
+      // must go through VERIFY so a mismatch actually fails the test.
+      VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+    }
+  }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0ba9d52e9..137d0d596 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,18 +9,19 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11
+
 using Eigen::Tensor;

-void test_cuda_nullary() {
+void test_gpu_nullary() {
   Tensor<float, 1, 0, int> in1(2);
   Tensor<float, 1, 0, int> in2(2);
   in1.setRandom();
@@ -30,12 +31,12 @@ void test_cuda_nullary() {
   float* d_in1;
   float* d_in2;
-  cudaMalloc((void**)(&d_in1), tensor_bytes);
-  cudaMalloc((void**)(&d_in2), tensor_bytes);
-  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  gpuMalloc((void**)(&d_in1), tensor_bytes);
+  gpuMalloc((void**)(&d_in2), tensor_bytes);
+  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);

-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);

   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -49,23 +50,23 @@ void test_cuda_nullary() {
   Tensor<float, 1, 0, int> new1(2);
   Tensor<float, 1, 0, int> new2(2);

-  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes,
cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX(new1(i), 3.14f); VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); } - cudaFree(d_in1); - cudaFree(d_in2); + gpuFree(d_in1); + gpuFree(d_in2); } -void test_cuda_elementwise_small() { +void test_gpu_elementwise_small() { Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2)); @@ -79,14 +80,14 @@ void test_cuda_elementwise_small() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -98,9 +99,9 @@ void test_cuda_elementwise_small() { gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX( @@ -108,12 +109,12 @@ void test_cuda_elementwise_small() { in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i))); } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_out); } -void test_cuda_elementwise() +void test_gpu_elementwise() { Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -132,16 +133,16 @@ void test_cuda_elementwise() float* d_in2; float* d_in3; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_in3), in3_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_in3), in3_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice 
stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -151,8 +152,8 @@ void test_cuda_elementwise() gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 53; ++j) { @@ -162,13 +163,13 @@ void test_cuda_elementwise() } } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_in3); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_in3); + gpuFree(d_out); } -void test_cuda_props() { +void test_gpu_props() { Tensor<float, 1> in1(200); Tensor<bool, 1> out(200); in1.setRandom(); @@ -178,12 +179,12 @@ void test_cuda_props() { float* d_in1; bool* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -193,19 +194,19 @@ void test_cuda_props() { gpu_out.device(gpu_device) = (gpu_in1.isnan)(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 200; ++i) { VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } -void test_cuda_reduction() +void test_gpu_reduction() { Tensor<float, 4> in1(72,53,97,113); Tensor<float, 2> out(72,97); @@ -216,12 +217,12 @@ void test_cuda_reduction() float* d_in1; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -233,8 +234,8 @@ void test_cuda_reduction() gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -249,12 +250,12 @@ void test_cuda_reduction() } } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } template<int 
DataLayout> -void test_cuda_contraction() +void test_gpu_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -274,14 +275,14 @@ void test_cuda_contraction() float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -301,7 +302,7 @@ void test_cuda_contraction() m_result = m_left * m_right; gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { @@ -310,13 +311,13 @@ void test_cuda_contraction() } } - cudaFree(d_t_left); - cudaFree(d_t_right); - cudaFree(d_t_result); + gpuFree(d_t_left); + gpuFree(d_t_right); + gpuFree(d_t_result); } template<int DataLayout> -void test_cuda_convolution_1d() +void test_gpu_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -331,14 +332,14 @@ void test_cuda_convolution_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -348,8 +349,8 @@ void test_cuda_convolution_1d() Eigen::array<Eigen::DenseIndex, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 34; ++j) { @@ -364,12 +365,12 @@ void test_cuda_convolution_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } 
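+// The mechanical cudaFoo -> gpuFoo renames throughout this file go through
+// the portability wrappers in TensorGpuHipCudaDefines.h (included above),
+// which resolve to CUDA or HIP at compile time. A simplified sketch of that
+// mapping (the real header covers many more symbols):
+//
+//   #if defined(EIGEN_USE_HIP)
+//   #define gpuMalloc  hipMalloc
+//   #define gpuMemcpy  hipMemcpy
+//   #define gpuSuccess hipSuccess
+//   #else  // CUDA
+//   #define gpuMalloc  cudaMalloc
+//   #define gpuMemcpy  cudaMemcpy
+//   #define gpuSuccess cudaSuccess
+//   #endif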
-void test_cuda_convolution_inner_dim_col_major_1d() +void test_gpu_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -384,14 +385,14 @@ void test_cuda_convolution_inner_dim_col_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -401,8 +402,8 @@ void test_cuda_convolution_inner_dim_col_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 71; ++i) { for (int j = 0; j < 9; ++j) { @@ -417,12 +418,12 @@ void test_cuda_convolution_inner_dim_col_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } -void test_cuda_convolution_inner_dim_row_major_1d() +void test_gpu_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -437,14 +438,14 @@ void test_cuda_convolution_inner_dim_row_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -454,8 +455,8 @@ void test_cuda_convolution_inner_dim_row_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { for (int j = 0; j 
< 9; ++j) { @@ -470,13 +471,13 @@ void test_cuda_convolution_inner_dim_row_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_2d() +void test_gpu_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -491,14 +492,14 @@ void test_cuda_convolution_2d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -508,8 +509,8 @@ void test_cuda_convolution_2d() Eigen::array<Eigen::DenseIndex, 2> dims(1,2); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -534,13 +535,13 @@ void test_cuda_convolution_2d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_3d() +void test_gpu_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -555,14 +556,14 @@ void test_cuda_convolution_3d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -572,8 +573,8 @@ void test_cuda_convolution_3d() Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, 
gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -612,14 +613,15 @@ void test_cuda_convolution_3d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template <typename Scalar> -void test_cuda_lgamma(const Scalar stddev) +void test_gpu_lgamma(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -631,12 +633,12 @@ void test_cuda_lgamma(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -644,8 +646,8 @@ void test_cuda_lgamma(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.lgamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -653,12 +655,13 @@ void test_cuda_lgamma(const Scalar stddev) } } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } +#endif template <typename Scalar> -void test_cuda_digamma() +void test_gpu_digamma() { Tensor<Scalar, 1> in(7); Tensor<Scalar, 1> out(7); @@ -685,12 +688,12 @@ void test_cuda_digamma() Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); @@ -698,8 +701,8 @@ void test_cuda_digamma() gpu_out.device(gpu_device) = gpu_in.digamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 5; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); @@ -708,12 +711,12 @@ void test_cuda_digamma() VERIFY_IS_EQUAL(out(i), expected_out(i)); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_zeta() +void test_gpu_zeta() { Tensor<Scalar, 1> in_x(6); Tensor<Scalar, 1> in_q(6); @@ -747,14 +750,14 @@ void test_cuda_zeta() Scalar* d_in_x; Scalar* d_in_q; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_q), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_q), bytes); + 
gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); @@ -763,8 +766,8 @@ void test_cuda_zeta() gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); VERIFY((std::isnan)(out(3))); @@ -775,13 +778,13 @@ void test_cuda_zeta() } } - cudaFree(d_in_x); - cudaFree(d_in_q); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_q); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_polygamma() +void test_gpu_polygamma() { Tensor<Scalar, 1> in_x(7); Tensor<Scalar, 1> in_n(7); @@ -818,14 +821,14 @@ void test_cuda_polygamma() Scalar* d_in_x; Scalar* d_in_n; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_n), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_n), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); @@ -834,20 +837,20 @@ void test_cuda_polygamma() gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_in_x); - cudaFree(d_in_n); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_n); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igamma() +void test_gpu_igamma() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -883,14 +886,14 @@ void test_cuda_igamma() Scalar* d_a; Scalar* d_x; Scalar* d_out; - assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice 
stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -899,8 +902,8 @@ void test_cuda_igamma() gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -912,13 +915,13 @@ void test_cuda_igamma() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igammac() +void test_gpu_igammac() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -953,14 +956,14 @@ void test_cuda_igammac() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -969,8 +972,8 @@ void test_cuda_igammac() gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -982,13 +985,14 @@ void test_cuda_igammac() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template <typename Scalar> -void test_cuda_erf(const Scalar stddev) +void test_gpu_erf(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -1000,12 +1004,12 @@ void test_cuda_erf(const Scalar stddev) Scalar* d_in; Scalar* d_out; - assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -1013,8 +1017,8 @@ void test_cuda_erf(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.erf(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + 
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1022,12 +1026,12 @@ void test_cuda_erf(const Scalar stddev)
     }
   }

-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }

 template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -1039,12 +1043,12 @@ void test_cuda_erfc(const Scalar stddev)
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);

-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);

   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1052,8 +1056,8 @@ void test_cuda_erfc(const Scalar stddev)

   gpu_out.device(gpu_device) = gpu_in.erfc();

-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1061,12 +1065,73 @@ void test_cuda_erfc(const Scalar stddev)
     }
   }

-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif

+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  // Nine sample points (indices 0..8), including both endpoints of the
+  // probability range, so all tensors and maps must have size 9.
+  Tensor<Scalar, 1> in_x(9);
+  Tensor<Scalar, 1> out(9);
+  Tensor<Scalar, 1> expected_out(9);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  // The two infinite endpoints must match exactly; the finite quantiles
+  // are checked approximately.
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY_IS_EQUAL(out(1), expected_out(1));
+
+  for (int i = 2; i < 9; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
 }

 template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
 {
   Tensor<Scalar, 1> in_x(125);
   Tensor<Scalar, 1> in_a(125);
@@ -1175,16 +1240,16 @@ void
test_cuda_betainc() Scalar* d_in_a; Scalar* d_in_b; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_a), bytes); - cudaMalloc((void**)(&d_in_b), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_a), bytes); + gpuMalloc((void**)(&d_in_b), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); @@ -1194,8 +1259,8 @@ void test_cuda_betainc() gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 1; i < 125; ++i) { if ((std::isnan)(expected_out(i))) { @@ -1205,83 +1270,374 @@ void test_cuda_betainc() } } - cudaFree(d_in_x); - cudaFree(d_in_a); - cudaFree(d_in_b); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_a); + gpuFree(d_in_b); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_i0e() +{ + Tensor<Scalar, 1> in_x(21); + Tensor<Scalar, 1> out(21); + Tensor<Scalar, 1> expected_out(21); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_x_array(21); + Array<Scalar, 1, Dynamic> expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361, + 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857, + 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554, + 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163, + 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128, + 0.0897803118848; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.bessel_i0e(); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_in); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_i1e() +{ + Tensor<Scalar, 1> in_x(21); + Tensor<Scalar, 1> out(21); + Tensor<Scalar, 1> expected_out(21); + out.setZero(); + + 
Array<Scalar, 1, Dynamic> in_x_array(21); + Array<Scalar, 1, Dynamic> expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.bessel_i1e(); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_in); + gpuFree(d_out); } +template <typename Scalar> +void test_gpu_igamma_der_a() +{ + Tensor<Scalar, 1> in_x(30); + Tensor<Scalar, 1> in_a(30); + Tensor<Scalar, 1> out(30); + Tensor<Scalar, 1> expected_out(30); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_a_array(30); + Array<Scalar, 1, Dynamic> in_x_array(30); + Array<Scalar, 1, Dynamic> expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. 
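+ // (As a rough host-side cross-check, not used by this test, d/da igamma(a, x) can be approximated with a central difference of the scalar kernel from unsupported/Eigen/SpecialFunctions: (numext::igamma(a + h, x) - numext::igamma(a - h, x)) / (2 * h) for a small relative step h.)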
+ + in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, + 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263, + -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645, + -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314, + -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921, + -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446, + -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363, + -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042, + -0.0125850719397, -0.00455500206958, -0.00476436993148; + + for (int i = 0; i < 30; ++i) { + in_x(i) = in_x_array(i); + in_a(i) = in_a_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } -void test_cxx11_tensor_cuda() + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_gamma_sample_der_alpha() { - CALL_SUBTEST_1(test_cuda_nullary()); - CALL_SUBTEST_1(test_cuda_elementwise_small()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_props()); - CALL_SUBTEST_1(test_cuda_reduction()); - CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); - -#if __cplusplus > 199711L + Tensor<Scalar, 1> in_alpha(30); + Tensor<Scalar, 1> in_sample(30); + Tensor<Scalar, 1> out(30); + Tensor<Scalar, 1> expected_out(30); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_alpha_array(30); + Array<Scalar, 1, Dynamic> 
in_sample_array(30); + Array<Scalar, 1, Dynamic> expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. + + in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, + 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, + 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, + 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, + 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, + 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, + 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, + 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, + 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, + 1.00106492525, 0.97734200649, 1.02198794179; + + for (int i = 0; i < 30; ++i) { + in_alpha(i) = in_alpha_array(i); + in_sample(i) = in_sample_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_alpha.size() * sizeof(Scalar); + + Scalar* d_alpha; + Scalar* d_sample; + Scalar* d_out; + gpuMalloc((void**)(&d_alpha), bytes); + gpuMalloc((void**)(&d_sample), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_alpha); + gpuFree(d_sample); + gpuFree(d_out); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_gpu) +{ + CALL_SUBTEST_1(test_gpu_nullary()); + CALL_SUBTEST_1(test_gpu_elementwise_small()); + CALL_SUBTEST_1(test_gpu_elementwise()); + CALL_SUBTEST_1(test_gpu_props()); + CALL_SUBTEST_1(test_gpu_reduction()); + CALL_SUBTEST_2(test_gpu_contraction<ColMajor>()); + CALL_SUBTEST_2(test_gpu_contraction<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. 
+// they hang; need to investigate and fix + CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>()); +#endif + +#if EIGEN_GPU_TEST_C99_MATH // std::erf, std::erfc, and so on were only added in C++11. We use them // as a golden reference to validate the results produced by Eigen. Therefore // we can only run these tests if we use a C++11 compiler. - CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); - // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); - // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); - - CALL_SUBTEST_5(test_cuda_digamma<float>()); - CALL_SUBTEST_5(test_cuda_digamma<double>()); - - CALL_SUBTEST_5(test_cuda_polygamma<float>()); - CALL_SUBTEST_5(test_cuda_polygamma<double>()); - - CALL_SUBTEST_5(test_cuda_zeta<float>()); - CALL_SUBTEST_5(test_cuda_zeta<double>()); - - CALL_SUBTEST_5(test_cuda_igamma<float>()); - CALL_SUBTEST_5(test_cuda_igammac<float>()); - - CALL_SUBTEST_5(test_cuda_igamma<double>()); - CALL_SUBTEST_5(test_cuda_igammac<double>()); - - CALL_SUBTEST_6(test_cuda_betainc<float>()); - CALL_SUBTEST_6(test_cuda_betainc<double>()); + CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erf<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f)); + // CALL_SUBTEST(test_gpu_erfc<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erf<double>(1.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erfc<double>(1.0)); + // CALL_SUBTEST(test_gpu_erfc<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks
precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erfc<double>(0.001)); + +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + + CALL_SUBTEST_5(test_gpu_ndtri<float>()); + CALL_SUBTEST_5(test_gpu_ndtri<double>()); + + CALL_SUBTEST_5(test_gpu_digamma<float>()); + CALL_SUBTEST_5(test_gpu_digamma<double>()); + + CALL_SUBTEST_5(test_gpu_polygamma<float>()); + CALL_SUBTEST_5(test_gpu_polygamma<double>()); + + CALL_SUBTEST_5(test_gpu_zeta<float>()); + CALL_SUBTEST_5(test_gpu_zeta<double>()); +#endif + + CALL_SUBTEST_5(test_gpu_igamma<float>()); + CALL_SUBTEST_5(test_gpu_igammac<float>()); + + CALL_SUBTEST_5(test_gpu_igamma<double>()); + CALL_SUBTEST_5(test_gpu_igammac<double>()); + +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + CALL_SUBTEST_6(test_gpu_betainc<float>()); + CALL_SUBTEST_6(test_gpu_betainc<double>()); + + CALL_SUBTEST_6(test_gpu_i0e<float>()); + CALL_SUBTEST_6(test_gpu_i0e<double>()); + + CALL_SUBTEST_6(test_gpu_i1e<float>()); + CALL_SUBTEST_6(test_gpu_i1e<double>()); + + CALL_SUBTEST_6(test_gpu_igamma_der_a<float>()); + CALL_SUBTEST_6(test_gpu_igamma_der_a<double>()); + + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>()); + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>()); +#endif + #endif } diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp index 5fd88fa6c..c20edd9ac 100644 --- a/unsupported/test/cxx11_tensor_ifft.cpp +++ b/unsupported/test/cxx11_tensor_ifft.cpp @@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) } } -void test_cxx11_tensor_ifft() { +EIGEN_DECLARE_TEST(cxx11_tensor_ifft) { CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4)); CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16)); CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32)); diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp new file mode 100644 index 000000000..db1c0206e --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 245; + IndexType sizeDim2 = 343; + IndexType sizeDim3 = 577; + + array<IndexType, 3> input_range = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> slice_range = {{sizeDim1-1, sizeDim2, sizeDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> tensor1(input_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor2(input_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range); + + typedef Eigen::DSizes<IndexType, 3> Index3; + Index3 strides1(1L, 1L, 1L); + Index3 indicesStart1(1L, 0L, 0L); + Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3); + + Index3 strides2(1L, 1L, 1L); + Index3 indicesStart2(0L, 0L, 0L); + Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3); + Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1, sizeDim2, sizeDim3); + + tensor1.setRandom(); + tensor2.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType)); + gpu3.device(sycl_device) = gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2); + + for (IndexType i = 0; i < slice_range[0]; ++i) { + for (IndexType j = 0; j < slice_range[1]; ++j) { + for (IndexType k = 0; k < slice_range[2]; ++k) { + VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_computing_test_per_device<double>(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 475c59651..862f1f7f0 100644 ---
a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -405,6 +405,57 @@ void test_patch_padding_same() } } +// Verifies that negative padding values computed for SAME padding are clipped +// to zero. +void test_patch_padding_same_negative_padding_clip_to_zero() { + int input_depth = 1; + int input_rows = 15; + int input_cols = 1; + int input_batches = 1; + int ksize = 1; // Corresponds to the Rows and Cols for + // tensor.extract_image_patches<>. + int row_stride = 5; + int col_stride = 1; + // ColMajor + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches( + ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME); + // Row padding would be computed as -2 here and is therefore clipped to 0. + VERIFY_IS_EQUAL(result.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 3); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // RowMajor + Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0)); + + Tensor<float, 5, RowMajor> result_row_major = + tensor_row_major.extract_image_patches(ksize, ksize, row_stride, + col_stride, 1, 1, PADDING_SAME); + VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0)); +} + void test_patch_no_extra_dim() { Tensor<float, 3> tensor(2,3,5); @@ -746,7 +797,7 @@ void test_imagenet_patches() } } -void test_cxx11_tensor_image_patch() +EIGEN_DECLARE_TEST(cxx11_tensor_image_patch) { CALL_SUBTEST_1(test_simple_patch()); CALL_SUBTEST_2(test_patch_no_extra_dim()); @@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch() CALL_SUBTEST_4(test_patch_padding_valid_same_value()); CALL_SUBTEST_5(test_patch_padding_same()); CALL_SUBTEST_6(test_imagenet_patches()); + CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero()); } diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp new file mode 100644 index 000000000..c1828a0ec --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template <typename DataType, typename IndexType> +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize 
=single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + 
sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); + 
VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); + + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + } + if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); + + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + + } + if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); + +} + + +// Verifies VALID padding (no padding) with incrementing values. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 3; + IndexType input_cols = 3; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
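+ // With PADDING_VALID each spatial dimension yields (input - ksize) / stride + 1 patch positions, so the 3x3 input with a 2x2 kernel and stride 2 below is expected to produce a single patch.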
+ + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), 
result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
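+ // PADDING_VALID here gives ((5 - 3) / 2 + 1)^2 = 4 patch positions per image, matching the patch count verified below.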
+ // ColMajor + + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), 
result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template <typename DataType, typename IndexType> +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
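+ // PADDING_SAME keeps ceil(input / stride) positions per spatial dimension: ceil(4 / 2) * ceil(2 / 2) = 2 patches per image here.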
+ + // ColMajor + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + 
VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template <typename DataType, typename IndexType> +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = 
tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
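For the 2x2 patches taken here, that calculation goes as follows (assuming the usual SAME-padding bookkeeping): out = ceil(in / stride) and pad_total = (out - 1) * stride + ksize - in, of which the leading part, pad_total / 2 rounded down, is applied before the first element. With stride 1 and ksize 2, the rows give pad_total = (3 - 1) * 1 + 2 - 3 = 1 and the columns give (5 - 1) * 1 + 2 - 5 = 1, so the leading padding is floor(1/2) = 0 on both axes.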
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template <typename DataType, typename IndexType> +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + 
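Since extract_image_patches(11, 11) defaults to stride 1 with SAME padding (an assumption about the default arguments, consistent with the checks below), every input position yields one patch: a 128x128 image produces 128*128 patches of 11x11 per batch, which is exactly what the dimension checks after the extraction assert.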
gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + IndexType patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
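Each subsequent configuration reuses the same host tensors, so the device buffers are deallocated and reallocated after every resize (the element counts change) before the same extraction-and-compare cycle is repeated, here with 9x9 patches.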
gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + IndexType patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + IndexType patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + IndexType patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_same_sycl<DataType, int64_t>(sycl_device); +test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device); +test_imagenet_patches_sycl<DataType, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl) +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index 4cf5df666..2166532c8 100644 --- 
a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -22,9 +22,9 @@ static void test_static_index_list() VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2); EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -167,19 +167,18 @@ static void test_type2indexpair_list() typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0; typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a; - typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b; - typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c; + typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b; + typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c; - Dims0 d0; Dims2_a d2_a; Dims2_b d2_b; - d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11)); + d2_b.set(1, Eigen::IndexPair<Index>(1,11)); Dims2_c d2_c; - d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10))); - d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11)); // setting type2indexpair to correct value. - d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12)); + d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10))); + d2_c.set(1, Eigen::IndexPair<Index>(1,11)); // setting type2indexpair to correct value. 
+ d2_c.set(2, Eigen::IndexPair<Index>(2,12)); VERIFY_IS_EQUAL(d2_a[0].first, 0); VERIFY_IS_EQUAL(d2_a[0].second, 10); @@ -278,9 +277,9 @@ static void test_dynamic_index_list() VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0); Tensor<float, 1> result = tensor.sum(reduction_axis); for (int i = 0; i < result.size(); ++i) { @@ -310,10 +309,10 @@ static void test_mixed_index_list() VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3); typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices; ReductionIndices reduction_indices; @@ -373,7 +372,7 @@ static void test_dim_check() #endif -void test_cxx11_tensor_index_list() +EIGEN_DECLARE_TEST(cxx11_tensor_index_list) { #ifdef EIGEN_HAS_INDEX_LIST CALL_SUBTEST(test_static_index_list()); diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp index 4997935e9..75089e856 100644 --- a/unsupported/test/cxx11_tensor_inflation.cpp +++ b/unsupported/test/cxx11_tensor_inflation.cpp @@ -74,7 +74,7 @@ static void test_simple_inflation() } } -void test_cxx11_tensor_inflation() +EIGEN_DECLARE_TEST(cxx11_tensor_inflation) { CALL_SUBTEST(test_simple_inflation<ColMajor>()); CALL_SUBTEST(test_simple_inflation<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp new file mode 100644 index 000000000..521ae0cc3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
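Before the test itself, it may help to see the operation on the host alone; a minimal CPU-only sketch of inflate() (sizes and values here are illustrative, not taken from the test):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> v(3);
  v.setValues({4.f, 4.f, 4.f});
  // inflate() keeps each value at a multiple of its stride and zero-fills
  // the gaps, so the size becomes (3 - 1) * 3 + 1 = 7.
  Eigen::array<Eigen::Index, 1> strides{{3}};
  Eigen::Tensor<float, 1> w = v.inflate(strides);
  // w is now (4, 0, 0, 4, 0, 0, 4).
  return w.dimension(0) == 7 ? 0 : 1;
}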
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation: along each dimension, the inflated size is
+//   (dim - 1) * stride[dim] + 1,
+// with the original values kept at multiples of the stride and zeros filled
+// in between. For example, a 1-d vector of size 3 with values (4, 4, 4) and
+// an inflation stride of 3 becomes a tensor of size (3 - 1) * 3 + 1 = 7 with
+// values (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  // Strides of 1 leave the tensor unchanged.
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  const size_t tensorBuffSize = tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  // Non-trivial strides: each inflated size is (dim - 1) * stride + 1.
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize = inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+ if (i % strides[0] == 0 && + j % strides[1] == 0 && + k % strides[2] == 0 && + l % strides[3] == 0) { + VERIFY_IS_EQUAL(inflated(i,j,k,l), + tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3])); + } else { + VERIFY_IS_EQUAL(0, inflated(i,j,k,l)); + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_stride); + sycl_device.deallocate(gpu_data_inflated); +} + +template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_inflation_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp index 8e2b70b75..d18a05ec4 100644 --- a/unsupported/test/cxx11_tensor_intdiv.cpp +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -135,7 +135,7 @@ void test_specific() { VERIFY_IS_EQUAL(result, result_op); } -void test_cxx11_tensor_intdiv() +EIGEN_DECLARE_TEST(cxx11_tensor_intdiv) { CALL_SUBTEST_1(test_signed_32bit()); CALL_SUBTEST_2(test_unsigned_32bit()); diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index 489960529..2c638f9bf 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -119,7 +119,7 @@ static void test_output_const() } -void test_cxx11_tensor_io() +EIGEN_DECLARE_TEST(cxx11_tensor_io) { CALL_SUBTEST(test_output_0d<ColMajor>()); CALL_SUBTEST(test_output_0d<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp index ae297a9da..efb333360 100644 --- a/unsupported/test/cxx11_tensor_layout_swap.cpp +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -54,7 +54,7 @@ static void test_swap_as_lvalue() } -void test_cxx11_tensor_layout_swap() +EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap) { CALL_SUBTEST(test_simple_swap()); CALL_SUBTEST(test_swap_as_lvalue()); diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp new file mode 100644 index 000000000..9546b911c --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
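As with the other SYCL ports, the host-side semantics being checked can be stated in a few lines; a minimal CPU-only sketch of swap_layout() (dimensions illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3, Eigen::ColMajor> t(2, 3, 7);
  t.setRandom();
  // swap_layout() reinterprets the same storage in the opposite layout:
  // the dimensions come back reversed and no element moves.
  Eigen::Tensor<float, 3, Eigen::RowMajor> s = t.swap_layout();
  bool ok = s.dimension(0) == 7 && s.dimension(2) == 2 && t(1, 2, 3) == s(3, 2, 1);
  return ok ? 0 : 1;
}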
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, typename IndexType> +static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); + //tensor.setRandom(); + +// Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template <typename DataType, typename IndexType> +static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.swap_layout().device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); +// tensor.setRandom(); + + //Tensor<float, 3, RowMajor> tensor2(7,3,2); +// tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + 
VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_swap_sycl<DataType, int64_t>(sycl_device); + test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp index 071f5b406..6ba9a212d 100644 --- a/unsupported/test/cxx11_tensor_lvalue.cpp +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -36,7 +36,7 @@ static void test_compound_assignment() } -void test_cxx11_tensor_lvalue() +EIGEN_DECLARE_TEST(cxx11_tensor_lvalue) { CALL_SUBTEST(test_compound_assignment()); } diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index 3db0ee7c0..4d4f68911 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -19,8 +19,8 @@ static void test_0d() Tensor<int, 0> scalar1; Tensor<int, 0, RowMajor> scalar2; - TensorMap<Tensor<const int, 0> > scalar3(scalar1.data()); - TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data()); + TensorMap<const Tensor<int, 0> > scalar3(scalar1.data()); + TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data()); scalar1() = 7; scalar2() = 13; @@ -37,8 +37,8 @@ static void test_1d() Tensor<int, 1> vec1(6); Tensor<int, 1, RowMajor> vec2(6); - TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6); - TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6); + TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6); + TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6); vec1(0) = 4; vec2(0) = 0; vec1(1) = 8; vec2(1) = 1; @@ -85,8 +85,8 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; - TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3); - TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3); + TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3); + TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3); VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); @@ -129,8 +129,8 @@ static void test_3d() } } - TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7); - TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7); + TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7); + TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); @@ -265,7 +265,54 @@ static void test_casting() VERIFY_IS_EQUAL(sum1, 861); } -void test_cxx11_tensor_map() +template<typename T> +static const T& add_const(T& value) { + return value; +} + +static void test_0d_const_tensor() +{ + Tensor<int, 0> scalar1; + Tensor<int, 0, RowMajor> scalar2; + + TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data()); + TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data()); + + scalar1() = 7; + scalar2() = 13; + + VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + + VERIFY_IS_EQUAL(scalar3(), 7); + VERIFY_IS_EQUAL(scalar4(), 13); +} + +static void test_0d_const_tensor_map() +{ + Tensor<int, 0> 
scalar1; + Tensor<int, 0, RowMajor> scalar2; + + const TensorMap<Tensor<int, 0> > scalar3(scalar1.data()); + const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data()); + + // Although TensorMap is constant, we still can write to the underlying + // storage, because we map over non-constant Tensor. + scalar3() = 7; + scalar4() = 13; + + VERIFY_IS_EQUAL(scalar1(), 7); + VERIFY_IS_EQUAL(scalar2(), 13); + + // Pointer to the underlying storage is also non-const. + scalar3.data()[0] = 8; + scalar4.data()[0] = 14; + + VERIFY_IS_EQUAL(scalar1(), 8); + VERIFY_IS_EQUAL(scalar2(), 14); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); CALL_SUBTEST(test_1d()); @@ -274,4 +321,7 @@ void test_cxx11_tensor_map() CALL_SUBTEST(test_from_tensor()); CALL_SUBTEST(test_casting()); + + CALL_SUBTEST(test_0d_const_tensor()); + CALL_SUBTEST(test_0d_const_tensor_map()); } diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp index 61c742a16..82a1a26d8 100644 --- a/unsupported/test/cxx11_tensor_math.cpp +++ b/unsupported/test/cxx11_tensor_math.cpp @@ -39,7 +39,7 @@ static void test_sigmoid() } -void test_cxx11_tensor_math() +EIGEN_DECLARE_TEST(cxx11_tensor_math) { CALL_SUBTEST(test_tanh()); CALL_SUBTEST(test_sigmoid()); diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp new file mode 100644 index 000000000..029653e27 --- /dev/null +++ b/unsupported/test/cxx11_tensor_math_sycl.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
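Every SYCL math test below follows the same host/device round-trip; a condensed sketch of the pattern (the rank-1 shape and the tanh op are illustrative):

template <typename DataType>
void round_trip_sketch(const Eigen::SyclDevice& sycl_device) {
  Eigen::Tensor<DataType, 1> in(64), out(64), out_cpu(64);
  in.setRandom();
  const size_t bytes = in.size() * sizeof(DataType);
  // Stage the input on the device and bind TensorMaps to the raw buffers.
  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(bytes));
  DataType* d_out = static_cast<DataType*>(sycl_device.allocate(bytes));
  Eigen::TensorMap<Eigen::Tensor<DataType, 1>> g_in(d_in, 64), g_out(d_out, 64);
  sycl_device.memcpyHostToDevice(d_in, in.data(), bytes);
  g_out.device(sycl_device) = g_in.tanh();  // evaluate on the device
  sycl_device.memcpyDeviceToHost(out.data(), d_out, bytes);
  out_cpu = in.tanh();                      // reference result on the host
  // Each element of out is then compared against out_cpu with VERIFY_IS_APPROX.
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}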
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.tanh(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.tanh(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.sigmoid(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.sigmoid(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp index 4fba6fdd1..ee2616fd7 100644 --- 
a/unsupported/test/cxx11_tensor_mixed_indices.cpp +++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp @@ -47,7 +47,7 @@ static void test_simple() } -void test_cxx11_tensor_mixed_indices() +EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices) { CALL_SUBTEST(test_simple()); } diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index f7de43110..ed5d5ade3 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -41,7 +41,29 @@ static void test_simple_reshape() } } -template<typename> +template <typename> +static void test_static_reshape() { +#if defined(EIGEN_HAS_INDEX_LIST) + using Eigen::type2index; + + Tensor<float, 5> tensor(2, 3, 1, 7, 1); + tensor.setRandom(); + + // New dimensions: [2, 3, 7] + Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim; + Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k)); + } + } + } +#endif +} + +template <typename> static void test_reshape_in_expr() { MatrixXf m1(2,3*5*7*11); MatrixXf m2(3*5*7*11,13); @@ -90,19 +112,19 @@ static void test_reshape_as_lvalue() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_simple_slice() { - Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<T, 5, DataLayout> tensor(2,3,5,7,11); tensor.setRandom(); - Tensor<float, 5, DataLayout> slice1(1,1,1,1,1); + Tensor<T, 5, DataLayout> slice1(1,1,1,1,1); Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - Tensor<float, 5, DataLayout> slice2(1,1,2,2,3); + Tensor<T, 5, DataLayout> slice2(1,1,2,2,3); Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); @@ -115,20 +137,20 @@ static void test_simple_slice() } } -template<typename=void> +template<typename T> static void test_const_slice() { - const float b[1] = {42}; - TensorMap<Tensor<const float, 1> > m(b, 1); + const T b[1] = {42}; + TensorMap<Tensor<const T, 1> > m(b, 1); DSizes<DenseIndex, 1> offsets; offsets[0] = 0; - TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions())); + TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions())); VERIFY_IS_EQUAL(slice_ref(0), 42); } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_in_expr() { - typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx; + typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx; Mtx m1(7,7); Mtx m2(3,3); m1.setRandom(); @@ -136,10 +158,10 @@ static void test_slice_in_expr() { Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); - TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7); - TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3); - Tensor<float, 2, DataLayout> tensor3(3,1); - typedef Tensor<float, 1>::DimensionPair DimPair; + TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7); + TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3); + Tensor<T, 2, DataLayout> tensor3(3,1); + typedef typename Tensor<T, 1>::DimensionPair DimPair; array<DimPair, 1> contract_along{{DimPair(1, 0)}}; Eigen::DSizes<ptrdiff_t, 2> indices1(1,2); @@ -156,28 +178,28 @@ static void test_slice_in_expr() { } // Take 
an arbitrary slice of an arbitrarily sized tensor. - TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7); - Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); + TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7); + Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); for (int i = 0; i < 35; ++i) { VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_as_lvalue() { - Tensor<float, 3, DataLayout> tensor1(2,2,7); + Tensor<T, 3, DataLayout> tensor1(2,2,7); tensor1.setRandom(); - Tensor<float, 3, DataLayout> tensor2(2,2,7); + Tensor<T, 3, DataLayout> tensor2(2,2,7); tensor2.setRandom(); - Tensor<float, 3, DataLayout> tensor3(4,3,5); + Tensor<T, 3, DataLayout> tensor3(4,3,5); tensor3.setRandom(); - Tensor<float, 3, DataLayout> tensor4(4,3,2); + Tensor<T, 3, DataLayout> tensor4(4,3,2); tensor4.setRandom(); - Tensor<float, 3, DataLayout> tensor5(10,13,12); + Tensor<T, 3, DataLayout> tensor5(10,13,12); tensor5.setRandom(); - Tensor<float, 3, DataLayout> result(4,5,7); + Tensor<T, 3, DataLayout> result(4,5,7); Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7); Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; @@ -223,10 +245,10 @@ static void test_slice_as_lvalue() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_raw_data() { - Tensor<float, 4, DataLayout> tensor(3,5,7,11); + Tensor<T, 4, DataLayout> tensor(3,5,7,11); tensor.setRandom(); Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4); @@ -253,7 +275,7 @@ static void test_slice_raw_data() extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1); auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2); - VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0)); + VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0)); if (DataLayout == ColMajor) { offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4); @@ -318,15 +340,15 @@ static void test_slice_raw_data() } -template<int DataLayout> +template<typename T, int DataLayout> static void test_strided_slice() { - typedef Tensor<float, 5, DataLayout> Tensor5f; + typedef Tensor<T, 5, DataLayout> Tensor5f; typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5; - typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Tensor<T, 2, DataLayout> Tensor2f; typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; - Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); - Tensor<float, 2, DataLayout> tensor2(7,11); + Tensor<T, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<T, 2, DataLayout> tensor2(7,11); tensor.setRandom(); tensor2.setRandom(); @@ -412,13 +434,13 @@ static void test_strided_slice() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_strided_slice_write() { - typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Tensor<T, 2, DataLayout> Tensor2f; typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; - Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11); + Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11); tensor.setRandom(); tensor2=tensor; Tensor2f slice(2,3); @@ -438,15 +460,14 @@ static void test_strided_slice_write() } } - -template<int DataLayout> +template<typename T, int DataLayout> static void test_composition() { - Eigen::Tensor<float, 
2, DataLayout> matrix(7, 11); + Eigen::Tensor<T, 2, DataLayout> matrix(7, 11); matrix.setRandom(); const DSizes<ptrdiff_t, 3> newDims(1, 1, 11); - Eigen::Tensor<float, 3, DataLayout> tensor = + Eigen::Tensor<T, 3, DataLayout> tensor = matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims); VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11); @@ -458,28 +479,87 @@ static void test_composition() } } +template<typename T, int DataLayout> +static void test_empty_slice() +{ + Tensor<T, 3, DataLayout> tensor(2,3,5); + tensor.setRandom(); + Tensor<T, 3, DataLayout> copy = tensor; + + // empty size in first dimension + Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2); + Tensor<T, 3, DataLayout> slice1(0,1,2); + slice1.setRandom(); + tensor.slice(indices1, sizes1) = slice1; + + // empty size in second dimension + Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2); + Tensor<T, 3, DataLayout> slice2(1,0,2); + slice2.setRandom(); + tensor.slice(indices2, sizes2) = slice2; + + // empty size in third dimension + Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0); + Tensor<T, 3, DataLayout> slice3(1,1,0); + slice3.setRandom(); + tensor.slice(indices3, sizes3) = slice3; + + // empty size in first and second dimension + Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2); + Tensor<T, 3, DataLayout> slice4(0,0,2); + slice4.setRandom(); + tensor.slice(indices4, sizes4) = slice4; + + // empty size in second and third dimension + Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0); + Tensor<T, 3, DataLayout> slice5(1,0,0); + slice5.setRandom(); + tensor.slice(indices5, sizes5) = slice5; + + // empty size in all dimensions + Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0); + Tensor<T, 3, DataLayout> slice6(0,0,0); + slice6.setRandom(); + tensor.slice(indices6, sizes6) = slice6; + + // none of these operations should change the tensor's components + // because all of the rvalue slices have at least one zero dimension + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k)); + } + } + } +} + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())) -void test_cxx11_tensor_morphing() +EIGEN_DECLARE_TEST(cxx11_tensor_morphing) { CALL_SUBTEST_1(test_simple_reshape<void>()); - CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_static_reshape<void>()); CALL_SUBTEST_1(test_reshape_as_lvalue<void>()); - - CALL_SUBTEST_1(test_simple_slice<ColMajor>()); - CALL_SUBTEST_1(test_simple_slice<RowMajor>()); - CALL_SUBTEST_1(test_const_slice()); - CALL_SUBTEST_2(test_slice_in_expr<ColMajor>()); - CALL_SUBTEST_3(test_slice_in_expr<RowMajor>()); - CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>()); - CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>()); - CALL_SUBTEST_5(test_slice_raw_data<ColMajor>()); - CALL_SUBTEST_5(test_slice_raw_data<RowMajor>()); - - CALL_SUBTEST_6(test_strided_slice_write<ColMajor>()); - CALL_SUBTEST_6(test_strided_slice<ColMajor>()); - 
CALL_SUBTEST_6(test_strided_slice_write<RowMajor>()); - CALL_SUBTEST_6(test_strided_slice<RowMajor>()); - - CALL_SUBTEST_7(test_composition<ColMajor>()); - CALL_SUBTEST_7(test_composition<RowMajor>()); + CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_const_slice<float>()); + + CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice); + CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue); + CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data); + CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write); + CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice); + CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition); } diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp new file mode 100644 index 000000000..bf001b40f --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -0,0 +1,386 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) +{ + typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1); + typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21); + + Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1); + Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2); + Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3); + Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4); + + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1); + TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + + gpu2.device(sycl_device)=gpu1.reshape(dim2); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType)); + + gpu3.device(sycl_device)=gpu1.reshape(dim3); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + 
gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); + sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor + if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); ///ColMajor + } + else{ + //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); /// RowMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k)); /// RowMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k)); /// RowMajor + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); + sycl_device.deallocate(gpu_data4); +} + + +template<typename DataType, int DataLayout, typename IndexType> +static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7); + typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7); + typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1); + Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1); + Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2); + Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3); + + tensor.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); + + TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1); + TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2); + TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + + gpu2.reshape(dim1).device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType)); + + gpu3.reshape(dim1).device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); + + + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor + } + else{ + VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k)); /// RowMajor + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_slice(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); + + 
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + typedef Eigen::DSizes<IndexType, 5> Index5; + Index5 strides(1L,1L,1L,1L,1L); + Index5 indicesStart(1L,2L,3L,4L,5L); + Index5 indicesStop(2L,3L,4L,5L,6L); + Index5 lengths(1L,1L,1L,1L,1L); + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); + Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); + DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range); + + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + + gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides); + 
sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5)); + + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); + Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range); + + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); + DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); + Index5 strides2(1L,1L,1L,1L,1L); + Index5 indicesStart2(1L,1L,3L,4L,5L); + Index5 indicesStop2(2L,2L,5L,6L,8L); + + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + + gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2); + sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + +template<typename DataType, int DataLayout, typename IndexType> +static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) +{ + typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f; + typedef Eigen::DSizes<IndexType, 2> Index2; + IndexType sizeDim1 = 7L; + IndexType sizeDim2 = 11L; + array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange); + IndexType sliceDim1 = 2; + IndexType sliceDim2 = 3; + array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}}; + Tensor2f slice(sliceRange); + Index2 strides(1L,1L); + Index2 indicesStart(3L,4L); + Index2 indicesStop(5L,7L); + Index2 lengths(2L,3L); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange); + + + tensor.setRandom(); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1; + + slice.setRandom(); + sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType)); + + + gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3; + gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); 
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + for(IndexType i=0;i<sizeDim1;i++) + for(IndexType j=0;j<sizeDim2;j++){ + VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + +template <typename OutIndex, typename DSizes> +Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) { + Eigen::array<OutIndex, DSizes::count> out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType> +int run_eigen(const SyclDevice& sycl_device) { + using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>; + using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>; + using TensorMI64 = TensorMap<TensorI64>; + using TensorMI32 = TensorMap<TensorI32>; + Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}}; + Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}}; + + TensorI64 out_tensor_gpu(tensor_range); + TensorI64 out_tensor_cpu(tensor_range); + out_tensor_cpu.setRandom(); + + TensorI64 sub_tensor(slice_range); + sub_tensor.setRandom(); + + DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType))); + DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType))); + TensorMI64 out_gpu(out_gpu_data, tensor_range); + TensorMI64 sub_gpu(sub_gpu_data, slice_range); + + sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType)); + sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType)); + + Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}}; + Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}}; + TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions())); + TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions())); + TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions())); + TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions())); + + out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32; + + out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32; + + sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType)); + int has_err = 0; + for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) { + auto exp = out_tensor_cpu(i); + auto val = out_tensor_gpu(i); + if (val != exp) { + std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl; + has_err = 1; + } + } + sycl_device.deallocate(out_gpu_data); + sycl_device.deallocate(sub_gpu_data); + return has_err; +} + +template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_slice<DataType, RowMajor, int64_t>(sycl_device); + test_simple_slice<DataType, ColMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, ColMajor, 
int64_t>(sycl_device); + test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device); + run_eigen<float, RowMajor, long, int>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_morphing_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp new file mode 100644 index 000000000..a2982319f --- /dev/null +++ b/unsupported/test/cxx11_tensor_move.cpp @@ -0,0 +1,76 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> +#include <utility> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void calc_indices(int i, int& x, int& y, int& z) +{ + x = i / 4; + y = (i % 4) / 2; + z = i % 2; +} + +static void test_move() +{ + int x; + int y; + int z; + + Tensor<int,3> tensor1(2, 2, 2); + Tensor<int,3,RowMajor> tensor2(2, 2, 2); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + tensor1(x,y,z) = i; + tensor2(x,y,z) = 2 * i; + } + + // Invokes the move constructor. + Tensor<int,3> moved_tensor1 = std::move(tensor1); + Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2); + + VERIFY_IS_EQUAL(tensor1.size(), 0); + VERIFY_IS_EQUAL(tensor2.size(), 0); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i); + } + + Tensor<int,3> moved_tensor3(2,2,2); + Tensor<int,3,RowMajor> moved_tensor4(2,2,2); + + moved_tensor3.setZero(); + moved_tensor4.setZero(); + + // Invokes the move assignment operator. 
+ moved_tensor3 = std::move(moved_tensor1); + moved_tensor4 = std::move(moved_tensor2); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i); + } +} + +EIGEN_DECLARE_TEST(cxx11_tensor_move) +{ + CALL_SUBTEST(test_move()); +} diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp index c946007b8..8e8165302 100644 --- a/unsupported/test/cxx11_tensor_notification.cpp +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -9,38 +9,21 @@ #define EIGEN_USE_THREADS +#include <atomic> + #include <stdlib.h> #include "main.h" #include <Eigen/CXX11/Tensor> -#if EIGEN_OS_WIN || EIGEN_OS_WIN64 -#include <windows.h> -void sleep(int seconds) { - Sleep(seconds*1000); -} -#else -#include <unistd.h> -#endif - - -namespace { - -void WaitAndAdd(Eigen::Notification* n, int* counter) { - n->Wait(); - *counter = *counter + 1; -} - -} // namespace - static void test_notification_single() { ThreadPool thread_pool(1); - int counter = 0; + std::atomic<int> counter(0); Eigen::Notification n; - std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + auto func = [&n, &counter](){ n.Wait(); ++counter;}; thread_pool.Schedule(func); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // The thread should be waiting for the notification. VERIFY_IS_EQUAL(counter, 0); @@ -48,7 +31,7 @@ static void test_notification_single() // Unblock the thread n.Notify(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // Verify the counter has been incremented VERIFY_IS_EQUAL(counter, 1); @@ -60,21 +43,21 @@ static void test_notification_multiple() { ThreadPool thread_pool(1); - int counter = 0; + std::atomic<int> counter(0); Eigen::Notification n; - std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + auto func = [&n, &counter](){ n.Wait(); ++counter;}; thread_pool.Schedule(func); thread_pool.Schedule(func); thread_pool.Schedule(func); thread_pool.Schedule(func); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); VERIFY_IS_EQUAL(counter, 0); n.Notify(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); VERIFY_IS_EQUAL(counter, 4); } -void test_cxx11_tensor_notification() +EIGEN_DECLARE_TEST(cxx11_tensor_notification) { CALL_SUBTEST(test_notification_single()); CALL_SUBTEST(test_notification_multiple()); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index e9d1b2d3c..99e18076a 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -94,7 +94,7 @@ static void test_contractions() } -void test_cxx11_tensor_of_complex() +EIGEN_DECLARE_TEST(cxx11_tensor_of_complex) { CALL_SUBTEST(test_additions()); CALL_SUBTEST(test_abs()); diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp index f179a0c21..344d678ef 100644 --- a/unsupported/test/cxx11_tensor_of_const_values.cpp +++ b/unsupported/test/cxx11_tensor_of_const_values.cpp @@ -97,7 +97,7 @@ static void test_plus_equal() } -void test_cxx11_tensor_of_const_values() +EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values) { CALL_SUBTEST(test_assign()); CALL_SUBTEST(test_plus()); diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 2f86980a2..30bcc1d28 100644 --- 
a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -9,21 +9,19 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; template<typename> -void test_cuda_numext() { - Eigen::CudaStreamDevice stream; +void test_gpu_numext() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -59,14 +57,14 @@ void test_cuda_numext() { } -#ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_GPU_FP16 template<typename> -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; - + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); @@ -97,8 +95,8 @@ void test_cuda_conversion() { } template<typename> -void test_cuda_unary() { - Eigen::CudaStreamDevice stream; +void test_gpu_unary() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -134,8 +132,8 @@ void test_cuda_unary() { } template<typename> -void test_cuda_elementwise() { - Eigen::CudaStreamDevice stream; +void test_gpu_elementwise() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -176,8 +174,8 @@ void test_cuda_elementwise() { } template<typename> -void test_cuda_trancendental() { - Eigen::CudaStreamDevice stream; +void test_gpu_trancendental() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -200,6 +198,8 @@ void test_cuda_trancendental() { Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); @@ -207,6 +207,7 @@ void test_cuda_trancendental() { gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); + gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); @@ -217,6 +218,9 @@ void test_cuda_trancendental() { gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1(); + Tensor<float, 1> input1(num_elem); 
Tensor<Eigen::half, 1> half_prec1(num_elem); Tensor<Eigen::half, 1> full_prec1(num_elem); @@ -243,7 +247,7 @@ void test_cuda_trancendental() { } for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl; - if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1 + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1 VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); else VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); @@ -264,8 +268,8 @@ void test_cuda_trancendental() { } template<typename> -void test_cuda_contractions() { - Eigen::CudaStreamDevice stream; +void test_gpu_contractions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int rows = 23; int cols = 23; @@ -315,36 +319,32 @@ void test_cuda_contractions() { } template<typename> -void test_cuda_reductions(int size1, int size2, int redux) { +void test_gpu_reductions(int size1, int size2, int redux) { std::cout << "Reducing " << size1 << " by " << size2 - << " tensor along dim " << redux << std::endl; + << " tensor along dim " << redux << std::endl; - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float( + d_float, size1, size2); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half( d_res_half, result_size); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float( d_res_float, result_size); - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + gpu_float.device(gpu_device) = gpu_float.random() * 2.0f; - Eigen::array<int, 1> redux_dim = {{redux}}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim); + Eigen::array<int, 1> redux_dim = {redux}; + gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim); Tensor<Eigen::half, 1> half_prec(result_size); Tensor<Eigen::half, 1> full_prec(result_size); @@ -357,50 +357,45 @@ void test_cuda_reductions(int size1, int size2, int redux) { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } template<typename> -void test_cuda_reductions() { - test_cuda_reductions<void>(13, 13, 0); - test_cuda_reductions<void>(13, 13, 1); +void test_gpu_reductions() { + test_gpu_reductions<void>(13, 13, 0); + 
test_gpu_reductions<void>(13, 13, 1); - test_cuda_reductions<void>(35, 36, 0); - test_cuda_reductions<void>(35, 36, 1); + test_gpu_reductions<void>(35, 36, 0); + test_gpu_reductions<void>(35, 36, 1); - test_cuda_reductions<void>(36, 35, 0); - test_cuda_reductions<void>(36, 35, 1); + test_gpu_reductions<void>(36, 35, 0); + test_gpu_reductions<void>(36, 35, 1); } template<typename> -void test_cuda_full_reductions() { - Eigen::CudaStreamDevice stream; +void test_gpu_full_reductions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int size = 13; int num_elem = size*size; - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float( + d_float, size, size); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half( d_res_half); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float( d_res_float); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float.device(gpu_device) = gpu_float.random(); - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(); + gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(); Tensor<Eigen::half, 0> half_prec; Tensor<Eigen::half, 0> full_prec; @@ -410,24 +405,23 @@ void test_cuda_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); + gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum(); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } template<typename> -void test_cuda_forced_evals() { +void test_gpu_forced_evals() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -440,7 +434,7 @@ void test_cuda_forced_evals() { d_float, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( d_res_half1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( d_res_half2, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); @@ -457,7 +451,7 @@ void test_cuda_forced_evals() { Tensor<float, 1> 
half_prec2(num_elem); Tensor<float, 1> full_prec(num_elem); gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); @@ -475,20 +469,20 @@ void test_cuda_forced_evals() { #endif -void test_cxx11_tensor_of_float16_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) { - CALL_SUBTEST_1(test_cuda_numext<void>()); - -#ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion<void>()); - CALL_SUBTEST_1(test_cuda_unary<void>()); - CALL_SUBTEST_1(test_cuda_elementwise<void>()); - CALL_SUBTEST_1(test_cuda_trancendental<void>()); - CALL_SUBTEST_2(test_cuda_contractions<void>()); - CALL_SUBTEST_3(test_cuda_reductions<void>()); - CALL_SUBTEST_4(test_cuda_full_reductions<void>()); - CALL_SUBTEST_5(test_cuda_forced_evals<void>()); + CALL_SUBTEST_1(test_gpu_numext<void>()); + +#ifdef EIGEN_HAS_GPU_FP16 + CALL_SUBTEST_1(test_gpu_conversion<void>()); + CALL_SUBTEST_1(test_gpu_unary<void>()); + CALL_SUBTEST_1(test_gpu_elementwise<void>()); + CALL_SUBTEST_1(test_gpu_trancendental<void>()); + CALL_SUBTEST_2(test_gpu_contractions<void>()); + CALL_SUBTEST_3(test_gpu_reductions<void>()); + CALL_SUBTEST_4(test_gpu_full_reductions<void>()); + CALL_SUBTEST_5(test_gpu_forced_evals<void>()); #else - std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; + std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl; #endif } diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp index 4ef9aed91..159656276 100644 --- a/unsupported/test/cxx11_tensor_of_strings.cpp +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -141,7 +141,7 @@ static void test_initialization() } -void test_cxx11_tensor_of_strings() +EIGEN_DECLARE_TEST(cxx11_tensor_of_strings) { // Beware: none of this is likely to ever work on a GPU. CALL_SUBTEST(test_assign()); diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index ffa19896e..b8a329deb 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -84,7 +84,7 @@ static void test_padded_expr() } } -void test_cxx11_tensor_padding() +EIGEN_DECLARE_TEST(cxx11_tensor_padding) { CALL_SUBTEST(test_simple_padding<ColMajor>()); CALL_SUBTEST(test_simple_padding<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp new file mode 100644 index 000000000..727a9ffd7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp @@ -0,0 +1,157 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template<typename DataType, int DataLayout, typename IndexType> +static void test_simple_padding(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<std::pair<IndexType, IndexType>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + IndexType padedSizeDim1 = 2; + IndexType padedSizeDim2 = 6; + IndexType padedSizeDim3 = 12; + IndexType padedSizeDim4 = 7; + array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange); + + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.pad(paddings); + sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); + for (IndexType i = 0; i < padedSizeDim1; ++i) { + for (IndexType j = 0; j < padedSizeDim2; ++j) { + for (IndexType k = 0; k < padedSizeDim3; ++k) { + for (IndexType l = 0; l < padedSizeDim4; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template<typename DataType, int DataLayout, typename IndexType> +static void test_padded_expr(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<std::pair<IndexType, IndexType>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Eigen::DSizes<IndexType, 2> reshape_dims; + reshape_dims[0] = 12; + reshape_dims[1] = 84; + + + Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, 
tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims); + + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 6; ++j) { + for (IndexType k = 0; k < 12; ++k) { + for (IndexType l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(result_value, 0.0f); + } + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_padding<DataType, RowMajor, int64_t>(sycl_device); + test_simple_padding<DataType, ColMajor, int64_t>(sycl_device); + test_padded_expr<DataType, RowMajor, int64_t>(sycl_device); + test_padded_expr<DataType, ColMajor, int64_t>(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_padding_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp index 434359730..498ab8ca7 100644 --- a/unsupported/test/cxx11_tensor_patch.cpp +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -164,7 +164,7 @@ static void test_simple_patch() } } -void test_cxx11_tensor_patch() +EIGEN_DECLARE_TEST(cxx11_tensor_patch) { CALL_SUBTEST(test_simple_patch<ColMajor>()); CALL_SUBTEST(test_simple_patch<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp new file mode 100644 index 000000000..7f92bec78 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -0,0 +1,249 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 5> patchTensorRange; + if (DataLayout == ColMajor) { + patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}}; + }else{ + patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}}; + } + + Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange); + + tensor.setRandom(); + + array<ptrdiff_t, 4> patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_no_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + } else { + VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size()); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), 1); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + + if (DataLayout == ColMajor) { + patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; + }else{ + patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}}; + } + Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange); + patchTensorBuffSize =single_patch.size()*sizeof(DataType); + DataType* gpu_data_single_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange); + + gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + } else { + 
VERIFY_IS_EQUAL(single_patch.dimension(0), 1); + VERIFY_IS_EQUAL(single_patch.dimension(1), 2); + VERIFY_IS_EQUAL(single_patch.dimension(2), 3); + VERIFY_IS_EQUAL(single_patch.dimension(3), 5); + VERIFY_IS_EQUAL(single_patch.dimension(4), 7); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,2,1,2*2*4*7}}; + }else{ + patchTensorRange = {{2*2*4*7, 1, 2,2,1}}; + } + Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange); + patchTensorBuffSize =twod_patch.size()*sizeof(DataType); + DataType* gpu_data_twod_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange); + + gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + } else { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 1); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + } else { + patch_loc = l + 7 * (k + 4 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0)); + } + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,3,5,2*2*3*3}}; + }else{ + patchTensorRange = {{2*2*3*3, 1, 2,3,5}}; + } + Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange); + patchTensorBuffSize =threed_patch.size()*sizeof(DataType); + DataType* gpu_data_threed_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange); + + gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + } else { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 5); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { 
+ for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + } else { + patch_loc = l + 3 * (k + 3 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z)); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_patch); + sycl_device.deallocate(gpu_data_single_patch); + sycl_device.deallocate(gpu_data_twod_patch); + sycl_device.deallocate(gpu_data_threed_patch); +} + +template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 0f3dc5787..b9d4c5584 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -11,9 +11,10 @@ #include <Eigen/CXX11/Tensor> +template<typename Scalar> static void test_default() { - Tensor<float, 1> vec(6); + Tensor<Scalar, 1> vec(6); vec.setRandom(); // Fixme: we should check that the generated numbers follow a uniform @@ -23,10 +24,11 @@ static void test_default() } } +template<typename Scalar> static void test_normal() { - Tensor<float, 1> vec(6); - vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>(); + Tensor<Scalar, 1> vec(6); + vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>(); // Fixme: we should check that the generated numbers follow a gaussian // distribution instead. 
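Aside, not part of the patch: the hunk above adds the `template` keyword to the setRandom call because, once test_normal is itself templated over Scalar, `vec` has a type that depends on a template parameter, and without `.template` the `<` in `setRandom<...>` would parse as a less-than comparison. Below is a minimal host-side sketch of the same call, assuming only that Eigen's unsupported Tensor headers are on the include path; `fill_normal` and the `main` driver are illustrative names, not part of the patch:

    #include <unsupported/Eigen/CXX11/Tensor>

    // In a dependent context the member template must be spelled
    // `.template setRandom<...>`, exactly as the hunk above does.
    template <typename Scalar>
    void fill_normal(Eigen::Tensor<Scalar, 1>& vec) {
      vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
    }

    int main() {
      Eigen::Tensor<float, 1> v(6);
      fill_normal(v);  // v now holds 6 draws from a normal distribution
      return 0;
    }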
@@ -70,9 +72,15 @@ static void test_custom() } } -void test_cxx11_tensor_random() +EIGEN_DECLARE_TEST(cxx11_tensor_random) { - CALL_SUBTEST(test_default()); - CALL_SUBTEST(test_normal()); + CALL_SUBTEST((test_default<float>())); + CALL_SUBTEST((test_normal<float>())); + CALL_SUBTEST((test_default<double>())); + CALL_SUBTEST((test_normal<double>())); + CALL_SUBTEST((test_default<Eigen::half>())); + CALL_SUBTEST((test_normal<Eigen::half>())); + CALL_SUBTEST((test_default<Eigen::bfloat16>())); + CALL_SUBTEST((test_normal<Eigen::bfloat16>())); CALL_SUBTEST(test_custom()); } diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_gpu.cu index b3be199e1..090986ebc 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_gpu.cu @@ -9,18 +9,16 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> -void test_cuda_random_uniform() +void test_gpu_random_uniform() { Tensor<float, 2> out(72,97); out.setZero(); @@ -28,24 +26,24 @@ void test_cuda_random_uniform() std::size_t out_bytes = out.size() * sizeof(float); float* d_out; - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); gpu_out.device(gpu_device) = gpu_out.random(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - // For now we just check thes code doesn't crash. + // For now we just check this code doesn't crash. 
// TODO: come up with a valid test of randomness
}

-void test_cuda_random_normal()
+void test_gpu_random_normal()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -53,9 +51,9 @@
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -63,8 +61,8 @@
Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
}

static void test_complex()
{
@@ -80,9 +78,9 @@
}

-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
{
- CALL_SUBTEST(test_cuda_random_uniform());
- CALL_SUBTEST(test_cuda_random_normal());
+ CALL_SUBTEST(test_gpu_random_uniform());
+ CALL_SUBTEST(test_gpu_random_normal());
CALL_SUBTEST(test_complex());
}
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 000000000..6c83894a3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+ gpu_out.device(sycl_device)=gpu_out.random();
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+ out.setZero();
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+ Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+ gpu_out.device(sycl_device)=gpu_out.random(gen);
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+ CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..c46c4c91d 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,22 @@ static void test_trivial_reductions() {
}
}
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
static void test_simple_reductions() {
- Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+ Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
tensor.setRandom();
+ // Add a little offset so that the product reductions won't be close to zero.
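+ // (setRandom() can produce factors arbitrarily close to zero; a product of
+ // many such factors collapses toward zero, where the relative comparison in
+ // the checks below becomes meaningless, hence the shift applied next.)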
+ tensor += tensor.constant(Scalar(0.5f)); array<ptrdiff_t, 2> reduction_axis2; reduction_axis2[0] = 1; reduction_axis2[1] = 3; - Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2); + Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 5); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 5; ++j) { - float sum = 0.0f; + Scalar sum = Scalar(0.0f); for (int k = 0; k < 3; ++k) { for (int l = 0; l < 7; ++l) { sum += tensor(i, k, j, l); @@ -77,7 +79,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> sum1 = tensor.sum(); + Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum(); VERIFY_IS_EQUAL(sum1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -85,7 +87,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4); VERIFY_IS_EQUAL(sum2.rank(), 0); VERIFY_IS_APPROX(sum1(), sum2()); @@ -98,7 +100,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 3; ++i) { for (int j = 0; j < 7; ++j) { - float prod = 1.0f; + Scalar prod = Scalar(1.0f); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 5; ++l) { prod *= tensor(k, i, l, j); @@ -109,7 +111,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> prod1 = tensor.prod(); + Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod(); VERIFY_IS_EQUAL(prod1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -117,7 +119,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4); + Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4); VERIFY_IS_EQUAL(prod2.rank(), 0); VERIFY_IS_APPROX(prod1(), prod2()); @@ -130,7 +132,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 3; ++i) { for (int j = 0; j < 7; ++j) { - float max_val = std::numeric_limits<float>::lowest(); + Scalar max_val = std::numeric_limits<Scalar>::lowest(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 5; ++l) { max_val = (std::max)(max_val, tensor(k, i, l, j)); @@ -141,7 +143,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> max1 = tensor.maximum(); + Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum(); VERIFY_IS_EQUAL(max1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -149,7 +151,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4); VERIFY_IS_EQUAL(max2.rank(), 0); VERIFY_IS_APPROX(max1(), max2()); @@ -162,7 +164,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 7; ++j) { - float min_val = (std::numeric_limits<float>::max)(); + Scalar min_val = (std::numeric_limits<Scalar>::max)(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { min_val = (std::min)(min_val, tensor(k, l, i, j)); @@ -173,7 +175,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> min1 = tensor.minimum(); + Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum(); VERIFY_IS_EQUAL(min1.rank(), 
0); array<ptrdiff_t, 4> reduction_axis4; @@ -181,7 +183,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4); VERIFY_IS_EQUAL(min2.rank(), 0); VERIFY_IS_APPROX(min1(), min2()); @@ -194,7 +196,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 7; ++j) { - float sum = 0.0f; + Scalar sum = Scalar(0.0f); int count = 0; for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { @@ -202,12 +204,12 @@ static void test_simple_reductions() { ++count; } } - VERIFY_IS_APPROX(result(i, j), sum / count); + VERIFY_IS_APPROX(result(i, j), sum / Scalar(count)); } } { - Tensor<float, 0, DataLayout> mean1 = tensor.mean(); + Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean(); VERIFY_IS_EQUAL(mean1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -215,7 +217,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4); + Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4); VERIFY_IS_EQUAL(mean2.rank(), 0); VERIFY_IS_APPROX(mean1(), mean2()); @@ -225,11 +227,11 @@ static void test_simple_reductions() { Tensor<int, 1> ints(10); std::iota(ints.data(), ints.data() + ints.dimension(0), 0); - TensorFixedSize<bool, Sizes<> > all; - all = ints.all(); - VERIFY(!all()); - all = (ints >= ints.constant(0)).all(); - VERIFY(all()); + TensorFixedSize<bool, Sizes<> > all_; + all_ = ints.all(); + VERIFY(!all_()); + all_ = (ints >= ints.constant(0)).all(); + VERIFY(all_()); TensorFixedSize<bool, Sizes<> > any; any = (ints > ints.constant(10)).any(); @@ -368,7 +370,7 @@ static void test_static_dims() { Tensor<float, 2, DataLayout> out(72, 97); in.setRandom(); -#if !EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -386,7 +388,7 @@ static void test_static_dims() { expected = (std::max)(expected, in(i, k, j, l)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -417,7 +419,7 @@ static void test_innermost_last_dims() { expected = (std::max)(expected, in(l, k, i, j)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -448,7 +450,7 @@ static void test_innermost_first_dims() { expected = (std::max)(expected, in(i, j, k, l)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -479,16 +481,37 @@ static void test_reduce_middle_dims() { expected = (std::max)(expected, in(i, k, l, j)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); + } + } +} + +static void test_sum_accuracy() { + Tensor<float, 3> tensor(101, 101, 101); + for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) { + tensor.setRandom(); + tensor += tensor.constant(prescribed_mean); + + Tensor<float, 0> sum = tensor.sum(); + double expected_sum = 0.0; + for (int i = 0; i < 101; ++i) { + for (int j = 0; j < 101; ++j) { + for (int k = 0; k < 101; ++k) { + expected_sum += static_cast<double>(tensor(i, j, k)); + } + } } + VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum)); } } -void test_cxx11_tensor_reduction() { +EIGEN_DECLARE_TEST(cxx11_tensor_reduction) { 
CALL_SUBTEST(test_trivial_reductions<ColMajor>()); CALL_SUBTEST(test_trivial_reductions<RowMajor>()); - CALL_SUBTEST(test_simple_reductions<ColMajor>()); - CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() )); + CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() )); + CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() )); + CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() )); CALL_SUBTEST(test_reductions_in_expr<ColMajor>()); CALL_SUBTEST(test_reductions_in_expr<RowMajor>()); CALL_SUBTEST(test_full_reductions<ColMajor>()); @@ -505,4 +528,5 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_innermost_first_dims<RowMajor>()); CALL_SUBTEST(test_reduce_middle_dims<ColMajor>()); CALL_SUBTEST(test_reduce_middle_dims<RowMajor>()); + CALL_SUBTEST(test_sum_accuracy()); } diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu index 6858b43a7..122ac946b 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu @@ -9,12 +9,9 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -22,7 +19,7 @@ template<typename Type, int DataLayout> static void test_full_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -70,7 +67,7 @@ static void test_first_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data(T) @@ -110,7 +107,7 @@ static void test_last_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data @@ -137,7 +134,7 @@ static void test_last_dim_reductions() { } -void test_cxx11_tensor_reduction_cuda() { +EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) { CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); CALL_SUBTEST_2((test_full_reductions<float, RowMajor>())); diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index a9ef82907..a297716e4 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -13,38 +13,168 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 #include "main.h" + #include <unsupported/Eigen/CXX11/Tensor> +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 753; + const IndexType num_cols = 537; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + array<IndexType, 2> outRange = {{1, 1}}; -static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { + Tensor<DataType, 2, DataLayout, 
IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange); - const int num_rows = 452; - const int num_cols = 765; - array<int, 2> tensorRange = {{num_rows, num_cols}}; + in.setRandom(); + auto dim = DSizes<IndexType, 2>(1, 1); + full_redux = in.sum().reshape(dim); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate( + sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, + outRange); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); + sycl_device.memcpyDeviceToHost( + full_redux_gpu.data(), gpu_out_data, + (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); - Tensor<float, 2> in(tensorRange); - Tensor<float, 0> full_redux; - Tensor<float, 0> full_redux_gpu; + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - full_redux = in.sum(); + // Check that the CPU and GPU reductions return the same result. 
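+ // (Approximate rather than exact comparison: the device is free to sum in a
+ // different association order than the sequential CPU loop, so the two
+ // floating-point results may differ in the last few bits.)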
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} - TensorMap<Tensor<float, 2> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 0> > out_gpu(gpu_out_data); +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + + full_redux = in.maximum(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. 
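+ // (The offset map created below skips the first 64 elements, so in(0) lies
+ // outside the reduced range; a result of 2 would therefore mean the offset
+ // view read past its bounds.)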
+ in(0) = static_cast<DataType>(2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); - out_gpu.device(sycl_device) = in_gpu.sum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); @@ -52,87 +182,833 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_out_data); } -static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + array<IndexType, 1> argRange = {{num_cols}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper( 
+ gpu_out_arg__gpu_helper_data, argRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + + // CPU VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. + const IndexType n_elems = 8707; + array<IndexType, 1> tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap<scalar_tensor> red_cpu(&full_redux); + TensorMap<scalar_tensor> red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast<DataType>(0.6391); + in = in.constant(const_val); + + Eigen::IndexList<Eigen::type2index<0>> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 876; + const IndexType num_cols = 953; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + + full_redux = in.minimum(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
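+ // (Unlike the sums, minimum and maximum return one of the input elements
+ // unchanged, so they are insensitive to the accumulation order on the
+ // device.)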
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. + in(0) = static_cast<DataType>(-2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; - int dim_x = 145; - int dim_y = 1; - int dim_z = 67; + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } + + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 0; - array<int, 2> reduced_tensorRange = {{dim_y, dim_z}}; - Tensor<float, 3> in(tensorRange); - Tensor<float, 2> redux(reduced_tensorRange); - Tensor<float, 2> redux_gpu(reduced_tensorRange); + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> full_reduced_range = {{num_rows}}; + array<IndexType, 1> reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); in.setRandom(); + redux.setZero(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } + + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. + VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); - redux= in.sum(red_axis); + // Check that the CPU and GPU reductions return the same result. 
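+ // (The copy above starts at out_gpu.data(), i.e. gpu_out_data + 1, so the
+ // deliberate one-element output offset is skipped and redux_gpu(i) lines up
+ // with red_offset(i) on the host.)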
+ for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), red_offset(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array<IndexType, 2> tensorRange = {{dim_x, dim_y}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 1> reduced_tensorRange = {{dim_y}}; - TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
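+ // (The mean over the first dimension equals the corresponding sum divided
+ // by its extent, dim_x, so it carries the same accumulation-order tolerance
+ // as the sum reductions above.)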
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 2; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - int dim_x = 567; - int dim_y = 1; - int dim_z = 47; + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 2; - array<int, 2> reduced_tensorRange = {{dim_x, dim_y}}; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; - Tensor<float, 3> in(tensorRange); - Tensor<float, 2> redux(reduced_tensorRange); - Tensor<float, 2> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. 
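+ // (With Sizes<64, 32> and a type2index<1> axis every extent is known at
+ // compile time, so the reduced result is a fixed-size rank-1 tensor of
+ // length 64.)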
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+ VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data. This is an example of how to check
+// whether a Tensor contains nan and/or inf in one reduction.
+template <typename InT, typename OutT>
+struct CustomReducer {
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ static constexpr OutT InfBit = 1;
+ static constexpr OutT NanBit = 2;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+ OutT* accum) const {
+ if (Eigen::numext::isinf(x))
+ *accum |= InfBit;
+ else if (Eigen::numext::isnan(x))
+ *accum |= NanBit;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+ OutT* accum) const {
+ *accum |= x;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+ return OutT(0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+ return accum;
+ }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+ typename IndexType>
+static void test_full_reductions_custom_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ constexpr IndexType InSize = 64;
+ auto tensorRange = Sizes<InSize>(InSize);
+ Eigen::IndexList<Eigen::type2index<0>> dims;
+ auto reduced_tensorRange = Sizes<>();
+ TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+ TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+ CustomReducer<DataType, AccumType> reducer;
+
+ in_fix.setRandom();
+
+ size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+ AccumType* gpu_out_data =
+ static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+ TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+ gpu_in_data, tensorRange);
+ TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+ out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+ sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+ sizeof(AccumType));
+ VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+ test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+ test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+ sycl_device);
+ sycl_device.synchronize();
}
-void test_cxx11_tensor_reduction_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice
sycl_device(s); - CALL_SUBTEST((test_full_reductions_sycl(sycl_device))); - CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device))); - CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device))); +template <typename DataType, typename Dev> +void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { + test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + sycl_device.synchronize(); +} + +template <typename DataType, typename Dev> +void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { + test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, + 129, 8); + test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + sycl_device.synchronize(); +} + +template <typename DataType, typename Dev> +void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { + test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); + sycl_device.synchronize(); +} +EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device)); + CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device)); + CALL_SUBTEST_3( + sycl_reduction_test_first_dim_per_device<float>(sycl_device)); + CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device)); + } } diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c8f105e3d..7dbd0478c 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -235,7 +235,7 @@ static void test_nested_ops_with_ref() } -void test_cxx11_tensor_ref() +EIGEN_DECLARE_TEST(cxx11_tensor_ref) { CALL_SUBTEST(test_simple_lvalue_ref()); CALL_SUBTEST(test_simple_rvalue_ref()); diff --git a/unsupported/test/cxx11_tensor_reverse.cpp 
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c8f105e3d..7dbd0478c 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -235,7 +235,7 @@ static void test_nested_ops_with_ref() } -void test_cxx11_tensor_ref() +EIGEN_DECLARE_TEST(cxx11_tensor_ref) { CALL_SUBTEST(test_simple_lvalue_ref()); CALL_SUBTEST(test_simple_rvalue_ref()); diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index b35b8d29e..5e44ec007 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue) } -void test_cxx11_tensor_reverse() +EIGEN_DECLARE_TEST(cxx11_tensor_reverse) { CALL_SUBTEST(test_simple_reverse<ColMajor>()); CALL_SUBTEST(test_simple_reverse<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp new file mode 100644 index 000000000..dd30c235d --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + reversed_tensor.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, + tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + // Check that the GPU reverse produces the expected element mapping.
+ for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(i, 2 - j, 4 - k, l)); + } + } + } + } + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(1 - i, j, k, 6 - l)); + } + } + } + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, + bool LValue) { + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate( + expected.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_result = static_cast<DataType*>( + sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected( + gpu_out_data_expected, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result( + gpu_out_data_result, tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); + + if (LValue) { + out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; + } else { + out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); + } + sycl_device.memcpyDeviceToHost( + expected.data(), gpu_out_data_expected, + expected.dimensions().TotalSize() * sizeof(DataType)); + + array<IndexType, 4> src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array<IndexType, 4> src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array<IndexType, 4> dst_slice_dim = src_slice_dim; + array<IndexType, 4> dst_slice_start = 
src_slice_start; + + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + sycl_device.memcpyHostToDevice( + gpu_out_data_result, result.data(), + (result.dimensions().TotalSize()) * sizeof(DataType)); + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); + } + } + } + } +} + +template <typename DataType> +void sycl_reverse_test_per_device(const cl::sycl::device& d) { + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true); +} +EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.get_info<cl::sycl::info::device::name>() << std::endl; + CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device)); + CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device)); + CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device)); +#endif + CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp index 2c26151ab..83b592384 100644 --- a/unsupported/test/cxx11_tensor_roundings.cpp +++ b/unsupported/test/cxx11_tensor_roundings.cpp @@ -54,7 +54,7 @@ static void test_float_ceiling() } } -void test_cxx11_tensor_roundings() +EIGEN_DECLARE_TEST(cxx11_tensor_roundings) { CALL_SUBTEST(test_float_rounding()); CALL_SUBTEST(test_float_ceiling()); diff --git 
a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp index af59aa3ef..dccee9e84 100644 --- a/unsupported/test/cxx11_tensor_scan.cpp +++ b/unsupported/test/cxx11_tensor_scan.cpp @@ -98,7 +98,7 @@ static void test_tensor_maps() { } } -void test_cxx11_tensor_scan() { +EIGEN_DECLARE_TEST(cxx11_tensor_scan) { CALL_SUBTEST((test_1d_scan<ColMajor, float, true>())); CALL_SUBTEST((test_1d_scan<ColMajor, float, false>())); CALL_SUBTEST((test_1d_scan<RowMajor, float, true>())); diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu index 5f146f3c9..770a144f1 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_gpu.cu @@ -9,21 +9,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_cumsum(int m_size, int k_size, int n_size) +void test_gpu_cumsum(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); @@ -38,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) float* d_t_input; float* d_t_result; - cudaMalloc((void**)(&d_t_input), t_input_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_input), t_input_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > @@ -54,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); t_result = t_input.cumsum(1); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -67,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_input); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_input); + gpuFree((void*)d_t_result); } -void test_cxx11_tensor_scan_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu) { - CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128)); } diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp new file mode 100644 index 000000000..09c45fce5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. 
+// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, int consume_dim, + bool exclusive) { + static const DataType error_threshold = 1e-4f; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size + << " consume_dim : " << consume_dim << ")" << std::endl; + Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size); + Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size); + Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size, + n_size); + + t_input.setRandom(); + std::size_t t_input_bytes = t_input.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType* gpu_data_in = + static_cast<DataType*>(sycl_device.allocate(t_input_bytes)); + DataType* gpu_data_out = + static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + + array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}}; + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input( + gpu_data_in, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result( + gpu_data_out, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes); + sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive); + + t_result = t_input.cumsum(consume_dim, exclusive); + + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out, + t_result_bytes); + sycl_device.synchronize(); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i) + << " vs SYCL : " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(gpu_data_in); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, 
int64_t>(sycl_device, 1023, 127, 2049, 2, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + false); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + false); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + false); +} +EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1( + sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device)); + CALL_SUBTEST_2( + sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device)); + CALL_SUBTEST_3( + sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device)); + CALL_SUBTEST_4( + sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device)); + CALL_SUBTEST_5( + sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device)); + CALL_SUBTEST_6( + sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index d11444a14..89a64c021 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor<float, 4, DataLayout> expected; expected = tensor.shuffle(shuffles); - Tensor<float, 4, DataLayout> result(5,7,3,2); + Tensor<float, 4, DataLayout> result(5, 7, 3, 2); - array<int, 4> src_slice_dim{{2,3,1,7}}; - array<int, 4> src_slice_start{{0,0,0,0}}; - array<int, 4> dst_slice_dim{{1,7,3,2}}; - array<int, 4> dst_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}}; + array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}}; + array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}}; + array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = @@ -215,7 +215,60 @@ static void test_shuffle_unshuffle() } -void test_cxx11_tensor_shuffling() +template <int DataLayout> +static void test_empty_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,0,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor<float, 4, DataLayout> no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + 
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 0); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) { CALL_SUBTEST(test_simple_shuffling<ColMajor>()); CALL_SUBTEST(test_simple_shuffling<RowMajor>()); @@ -225,4 +278,6 @@ void test_cxx11_tensor_shuffling() CALL_SUBTEST(test_shuffling_as_value<RowMajor>()); CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>()); CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>()); + CALL_SUBTEST(test_empty_shuffling<ColMajor>()); + CALL_SUBTEST(test_empty_shuffling<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp new file mode 100644 index 000000000..ca4e8b5ef --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -0,0 +1,117 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
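Since the shuffling tests in this new file, like the empty-shuffle test above, verify results with hand-written index arithmetic, it may help to restate the semantics of Tensor::shuffle: output dimension i takes its extent from input dimension shuffles[i], and element indices permute the same way. A minimal host-side sketch (a 3-D analogue of the 4-D cases; not part of this patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 5);
  t.setRandom();

  // Output dim i gets its extent from input dim perm[i].
  Eigen::array<ptrdiff_t, 3> perm = {{2, 0, 1}};
  Eigen::Tensor<float, 3> s = t.shuffle(perm);  // s is 5 x 2 x 3

  // Element mapping: s(k, i, j) == t(i, j, k), the same rule behind the
  // shuffle(k, l, j, i) checks in the 4-D tests.
  return (s(4, 1, 2) == t(1, 2, 4)) ? 0 : 1;
}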
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange); + tensor.setRandom(); + + const size_t buffSize = tensor.size() * sizeof(DataType); + array<IndexType, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2, + tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); + + gpu2.device(sycl_device) = gpu1.shuffle(shuffles); + sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize); + sycl_device.synchronize(); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + array<IndexType, 4> tensorrangeShuffle = { + {sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3( + gpu_data3, tensorrangeShuffle); + + gpu3.device(sycl_device) = gpu1.shuffle(shuffles); + sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); + sycl_device.synchronize(); + + VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3); + VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4); + VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); + VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i)); + } + } + } + } +} + +template <typename DataType, typename dev_Selector> +void sycl_shuffling_test_per_device(dev_Selector s) { + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + 
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 5a0d339ef..6d70f5435 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -316,7 +316,7 @@ static void test_resize() VERIFY_IS_EQUAL(epsilon.size(), 3*5*7); } -void test_cxx11_tensor_simple() +EIGEN_DECLARE_TEST(cxx11_tensor_simple) { CALL_SUBTEST(test_0d()); CALL_SUBTEST(test_1d()); diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 935b908cc..aefdfa9b4 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -110,7 +110,7 @@ static void test_striding_as_lvalue() } -void test_cxx11_tensor_striding() +EIGEN_DECLARE_TEST(cxx11_tensor_striding) { CALL_SUBTEST(test_simple_striding<ColMajor>()); CALL_SUBTEST(test_simple_striding<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp new file mode 100644 index 000000000..d3b1fa77c --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -0,0 +1,203 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_striding(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + 
gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + //no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; +//Tensor<float, 4, DataLayout> stride; +// stride = tensor.stride(strides); + + gpu_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (IndexType i = 0; i < 1; ++i) { + for (IndexType j = 0; j < 1; ++j) { + for (IndexType k = 0; k < 3; ++k) { + for (IndexType l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } + + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + + DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + //Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + +// Tensor<float, 4, DataLayout> result(3, 12, 10, 21); +// result.stride(strides) = tensor; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); + } + } + } + } + + array<IndexType, 4> no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; +// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); +// 
result2.stride(strides) = tensor.stride(no_strides); + + gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); + } + } + } + } + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + + +template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_simple_striding<float, ColMajor, int64_t>(sycl_device); + test_simple_striding<float, RowMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorStridingPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index 2f56eb495..2ca5c47db 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() { } } -void test_cxx11_tensor_sugar() +EIGEN_DECLARE_TEST(cxx11_tensor_sugar) { CALL_SUBTEST(test_comparison_sugar()); CALL_SUBTEST(test_scalar_sugar_add_mul()); diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 6a9c33422..e6c5e2378 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -15,8 +15,8 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -27,36 +27,188 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 5; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); + + in1 = in1.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); + gpu1.device(sycl_device) = gpu1 * 3.14f; + gpu2.device(sycl_device) = gpu2 * 2.7f; + sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType)); + 
sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < in1.size(); ++i) { + // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n"; + VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); + VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); + VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); + } + + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange); + + in1 = in1.random(); + in2 = in1; + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.synchronize(); + in1.setZero(); + + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < in1.size(); ++i) { + VERIFY_IS_APPROX(out(i), in2(i)); + } + + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array<IndexType, 1> tensorRange = {{full_size}}; + tensor_type in1(tensorRange); + tensor_type out(tensorRange); + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu1(gpu_data, tensorRange); + + in1 = in1.random(); + // Copy all data to device, then permute on copy back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + // Permute copies to device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange); + // Copy all to device, permute copies on device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out + 
half_size, gpu_data, half_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + sycl_device.deallocate(gpu_data_out); + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array<IndexType, 1> tensorRange = {{full_size}}; + tensor_type cpu_out(tensorRange); + tensor_type out(tensorRange); + + cpu_out.setZero(); + + std::memset(cpu_out.data(), 0, half_size * sizeof(DataType)); + std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType)); + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu1(gpu_data, tensorRange); + + sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType)); + sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < full_size; ++i) { + VERIFY_IS_APPROX(out(i), cpu_out(i)); + } + + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 100; - int sizeDim3 = 100; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<float, 3> in1(tensorRange); - Tensor<float, 3> in2(tensorRange); - Tensor<float, 3> in3(tensorRange); - Tensor<float, 3> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); in2 = in2.random(); in3 = in3.random(); - float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); - float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); - float * gpu_in3_data = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType))); + DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<DataType, 3, 
DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); - sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -65,10 +217,12 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// a=b*1.2f gpu_out.device(sycl_device) = gpu_in1 * 1.2f; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -77,12 +231,14 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { printf("a=b*1.2f Test Passed\n"); /// c=a*b - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -93,10 +249,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// c=a+b gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -107,10 +264,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// c=a*a gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + 
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -121,10 +279,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { //a*3.14f + b*2.7f gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); - sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); @@ -134,12 +293,13 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { printf("a*3.14f + b*2.7f Test Passed\n"); ///d= (a>0.5? b:c) - sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType)); gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k)); @@ -152,8 +312,50 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } -void test_cxx11_tensor_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_sycl_cpu(sycl_device)); +template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> +static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); + + in = in.random(); + + Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); + Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); + + TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); + TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); + gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); + out_host = in. 
template cast<Scalar2>(); + for(IndexType i=0; i< size; i++) + { + VERIFY_IS_APPROX(out(i), out_host(i)); + } + printf("cast Test Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp index d680e9b3b..fed269a9a 100644 --- a/unsupported/test/cxx11_tensor_symmetry.cpp +++ b/unsupported/test/cxx11_tensor_symmetry.cpp @@ -801,7 +801,7 @@ static void test_tensor_randacc() } } -void test_cxx11_tensor_symmetry() +EIGEN_DECLARE_TEST(cxx11_tensor_symmetry) { CALL_SUBTEST(test_symgroups_static()); CALL_SUBTEST(test_symgroups_dynamic()); diff --git a/unsupported/test/cxx11_tensor_thread_local.cpp b/unsupported/test/cxx11_tensor_thread_local.cpp new file mode 100644 index 000000000..7e866f6d1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_thread_local.cpp @@ -0,0 +1,149 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include <iostream> +#include <unordered_set> + +#include "main.h" +#include <Eigen/CXX11/ThreadPool> + +struct Counter { + Counter() = default; + + void inc() { + // Check that mutation happens only in a thread that created this counter. 
+ VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by); + counter_value++; + } + int value() { return counter_value; } + + std::thread::id created_by; + int counter_value = 0; +}; + +struct InitCounter { + void operator()(Counter& counter) { + counter.created_by = std::this_thread::get_id(); + } +}; + +void test_simple_thread_local() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter()); + + int num_tasks = 3 * num_threads; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + barrier.Notify(); + }); + } + + barrier.Wait(); + + counter.ForEach( + [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); }); +} + +void test_zero_sized_thread_local() { + Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter()); + + Counter& local = counter.local(); + local.inc(); + + int total = 0; + counter.ForEach([&total](std::thread::id, Counter& cnt) { + total += cnt.value(); + VERIFY_IS_EQUAL(cnt.value(), 1); + }); + + VERIFY_IS_EQUAL(total, 1); +} + +// All thread local values fit into the lock-free storage. +void test_large_number_of_tasks_no_spill() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter()); + + int num_tasks = 10000; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + barrier.Notify(); + }); + } + + barrier.Wait(); + + int total = 0; + std::unordered_set<std::thread::id> unique_threads; + + counter.ForEach([&](std::thread::id id, Counter& cnt) { + total += cnt.value(); + unique_threads.insert(id); + }); + + VERIFY_IS_EQUAL(total, num_tasks); + // Not all threads in a pool might be woken up to execute submitted tasks. + // Also thread_pool.Schedule() might use current thread if queue is full. + VERIFY_IS_EQUAL( + unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true); +} + +// Lock free thread local storage is too small to fit all the unique threads, +// and it spills to a map guarded by a mutex. +void test_large_number_of_tasks_with_spill() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter()); + + int num_tasks = 10000; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + barrier.Notify(); + }); + } + + barrier.Wait(); + + int total = 0; + std::unordered_set<std::thread::id> unique_threads; + + counter.ForEach([&](std::thread::id id, Counter& cnt) { + total += cnt.value(); + unique_threads.insert(id); + }); + + VERIFY_IS_EQUAL(total, num_tasks); + // Not all threads in a pool might be woken up to execute submitted tasks. + // Also thread_pool.Schedule() might use current thread if queue is full.
+ VERIFY_IS_EQUAL( + unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) { + CALL_SUBTEST(test_simple_thread_local()); + CALL_SUBTEST(test_zero_sized_thread_local()); + CALL_SUBTEST(test_large_number_of_tasks_no_spill()); + CALL_SUBTEST(test_large_number_of_tasks_with_spill()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2ef665f30..b772a1d60 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -16,29 +16,72 @@ using Eigen::Tensor; +class TestAllocator : public Allocator { + public: + ~TestAllocator() EIGEN_OVERRIDE {} + EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->alloc_count_++; + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->dealloc_count_++; + internal::aligned_free(buffer); + } + + int alloc_count() const { return alloc_count_; } + int dealloc_count() const { return dealloc_count_; } + + private: + int alloc_count_ = 0; + int dealloc_count_ = 0; +}; void test_multithread_elementwise() { - Tensor<float, 3> in1(2,3,7); - Tensor<float, 3> in2(2,3,7); - Tensor<float, 3> out(2,3,7); + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); in1.setRandom(); in2.setRandom(); Eigen::ThreadPool tp(internal::random<int>(3, 11)); Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); - out.device(thread_pool_device) = in1 + in2 * 3.14f; + out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>(); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); } } } } +void test_async_multithread_elementwise() +{ + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPool tp(internal::random<int>(3, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); + + Eigen::Barrier b(1); + out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>(); + b.Wait(); + + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); + } + } + } +} void test_multithread_compound_assignment() { @@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() { } } +// Apply Sqrt to all output elements. 
+struct SqrtOutputKernel { + template <typename Index, typename Scalar> + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template <int DataLayout> +static void test_multithread_contraction_with_output_kernel() { + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in mat4 to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +template<int DataLayout> +void test_async_multithread_contraction_agrees_with_singlethread() +{ + int contract_size = internal::random<int>(100, 500); + + Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40), + contract_size, + internal::random<int>(10, 40)); + + Tensor<float, 4, DataLayout> right( + internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size, + internal::random<int>(1, 20)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); + + Eigen::ThreadPool tp(internal::random<int>(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32)); + + Tensor<float, 5, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); + + Eigen::Barrier barrier(1); + tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = + left.contract(right, dims); + barrier.Wait(); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test + // will fail due to numerical precision issues when values are small) + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + +// We are triggering 'evalShardedByInnerDim' optimization. 
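// A rough sketch of what that strategy does (an illustration, not the actual
// implementation): when the output block is too small to parallelize (here
// 2x10) but the contracted dimension is huge (here 10000), the contraction
// C = A * B is sharded over the inner dimension instead. Each thread t
// multiplies one k-slice and the partial products are summed:
//
//   C = sum_t  A(:, k_t) * B(k_t, :)
//
// The tests below use exactly such shapes to force this code path.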
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims);
+  barrier.Wait();
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims, SqrtOutputKernel());
+  barrier.Wait();
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
 
 template<int DataLayout>
 void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
 }
 
 template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
 {
   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
   tensor.setRandom();
 
   const int num_threads = internal::random<int>(2, 11);
   ThreadPool threads(num_threads);
-  Eigen::ThreadPoolDevice device(&threads, num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
 
   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@
   }
 }
 
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+  const int num_threads = internal::random<int>(2, 11);
+  const int num_allocs = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  for (int a = 0; a < num_allocs; ++a) {
+    void* ptr = device.allocate(512);
+    device.deallocate(ptr);
+  }
+  VERIFY(allocator != NULL);
+  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
 
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
 {
   CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_async_multithread_elementwise());
   CALL_SUBTEST_1(test_multithread_compound_assignment());
 
   CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Test EvalShardedByInnerDimContext parallelization strategy.
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
 
-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  TestAllocator test_allocator;
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
 
-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
 }
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 000000000..009722895
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
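// What Tensor::trace(dims) computes (an informal summary for the tests below):
// the entries of the input are summed over all index tuples that agree on
// every dimension listed in 'dims' (those dimensions must have equal sizes),
// and the remaining dimensions are kept. For a 3x5x3 tensor t with
// dims = {0, 2}, the result is a rank-1 tensor of size 5 with
//
//   result(i) = t(0, i, 0) + t(1, i, 1) + t(2, i, 2)
//
// which is exactly what test_simple_trace() checks element by element.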
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+  Tensor<float, 0, DataLayout> tensor;
+  tensor.setRandom();
+  array<ptrdiff_t, 0> dims;
+  Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+  VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+  Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+  tensor1.setRandom();
+  Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+  VERIFY_IS_EQUAL(result1.rank(), 0);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum += tensor1(i, i, i);
+  }
+  VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+  Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+  VERIFY_IS_EQUAL(result2.rank(), 0);
+  sum = 0.0f;
+  for (int i = 0; i < 7; ++i) {
+    sum += tensor2(i, i, i, i, i);
+  }
+  VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+  Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+  tensor1.setRandom();
+  array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
+  Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+  VERIFY_IS_EQUAL(result1.rank(), 1);
+  VERIFY_IS_EQUAL(result1.dimension(0), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor1(j, i, j);
+    }
+    VERIFY_IS_EQUAL(result1(i), sum);
+  }
+
+  Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+  Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+  VERIFY_IS_EQUAL(result2.rank(), 2);
+  VERIFY_IS_EQUAL(result2.dimension(0), 5);
+  VERIFY_IS_EQUAL(result2.dimension(1), 5);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 7; ++k) {
+        sum += tensor2(i, j, k, k);
+      }
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+    }
+  }
+
+  array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+  Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+  VERIFY_IS_EQUAL(result3.rank(), 2);
+  VERIFY_IS_EQUAL(result3.dimension(0), 7);
+  VERIFY_IS_EQUAL(result3.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        sum += tensor2(k, k, i, j);
+      }
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+  tensor3.setRandom();
+  array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+  Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+  VERIFY_IS_EQUAL(result4.rank(), 2);
+  VERIFY_IS_EQUAL(result4.dimension(0), 7);
+  VERIFY_IS_EQUAL(result4.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor3(k, i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result4(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+  tensor4.setRandom();
+  array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+  Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+  VERIFY_IS_EQUAL(result5.rank(), 3);
+  VERIFY_IS_EQUAL(result5.dimension(0), 3);
+  VERIFY_IS_EQUAL(result5.dimension(1), 4);
+  VERIFY_IS_EQUAL(result5.dimension(2), 5);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        sum = 0.0f;
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor4(i, l, j, l, k);
+        }
+        VERIFY_IS_EQUAL(result5(i, j, k), sum);
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> dims = { { 1, 3 } };
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.trace(dims);
+  VERIFY_IS_EQUAL(result.rank(), 2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor(i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+  CALL_SUBTEST(test_0D_trace<ColMajor>());
+  CALL_SUBTEST(test_0D_trace<RowMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+  CALL_SUBTEST(test_simple_trace<ColMajor>());
+  CALL_SUBTEST(test_simple_trace<RowMajor>());
+  CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+  CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..46fceaa19 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -12,7 +12,7 @@
 
 #include <Eigen/CXX11/Tensor>
 
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
 #define EIGEN_NO_INT128
 #else
 typedef __uint128_t uint128_t;
@@ -144,7 +144,7 @@ void test_misc2() {
 
 #endif
 
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
 {
 #ifdef EIGEN_NO_INT128
   // Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..862212e82 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@ static void test_entire_volume_patch()
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
   }
 }
 
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
 {
   CALL_SUBTEST(test_single_voxel_patch());
   CALL_SUBTEST(test_entire_volume_patch());
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 000000000..8d99a48ed
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
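// A worked example of the padding convention fixed in the hunk above
// (assuming the usual SAME-padding split, with the smaller half in front):
// the total padding per dimension is patch_size - 1, of which
// floor((patch_size - 1) / 2) goes in front. For patch_x = 5, dx = 4 and
// forward_pad_x = 2, with the remaining 2 at the back; for patch_z = 2,
// dz = 1 and forward_pad_z = 0, so the single padded plane sits at the back.
// The old expectation (dz - dz / 2) put the larger half in front, which was
// off by one whenever the patch size is even.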
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim0 = 4;
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(), (tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device) = gpu_col_major.swap_layout();
+
+  // single voxel patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange = {{sizeDim0, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 6, DataLayout, IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize = single_voxel_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout, IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+  gpu_single_voxel_patch_col_major.device(sycl_device) = gpu_col_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+  array<IndexType, 6> patchRowMajorTensorRange = {{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+  Tensor<DataType, 6, RowMajor, IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize = single_voxel_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor, IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_voxel_patch_row_major.device(sycl_device) = gpu_row_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+  array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(), (tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device) = gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  // entire volume patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange = {{depth, patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+  Tensor<DataType, 6, DataLayout, IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize = entire_volume_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout, IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_volume_patch_col_major.device(sycl_device) = gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+//  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+//  tensor.setRandom();
+//  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  //Tensor<float, 6> entire_volume_patch;
+  //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+//  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+  array<IndexType, 6> patchRowMajorTensorRange = {{batch, patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 6, RowMajor, IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize = entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor, IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device) = gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+template<typename DataType, typename dev_Selector>
+void sycl_tensor_volume_patch_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+  test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+  test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
index 2b11807c8..5f63161b2 100644
--- a/unsupported/test/dgmres.cpp
+++ b/unsupported/test/dgmres.cpp
@@ -9,7 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "../../test/sparse_solver.h"
-#include <Eigen/src/IterativeSolvers/DGMRES.h>
+#include <unsupported/Eigen/IterativeSolvers>
 
 template<typename T> void test_dgmres_T()
 {
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)  );
 }
 
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
 {
   CALL_SUBTEST_1(test_dgmres_T<double>());
   CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e86..14a909d3b 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -35,7 +35,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
   VERIFY_IS_APPROX(j, jref);
 }
 
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
 {
   adtl::setNumDir(NUMBER_DIRECTIONS);
 
@@ -132,7 +132,7 @@ void test_forward_adolc()
   }
 
   {
-    // simple instanciation tests
+    // simple instantiation tests
     Matrix<adtl::adouble,2,1> x;
     foo(x);
     Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
index f2969116b..8d2254b5b 100644
--- a/unsupported/test/gmres.cpp
+++ b/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)  );
 }
 
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
 {
   CALL_SUBTEST_1(test_gmres_T<double>());
   CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/unsupported/test/idrs.cpp b/unsupported/test/idrs.cpp
new file mode 100644
index 000000000..f88c01632
--- /dev/null
+++ b/unsupported/test/idrs.cpp
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
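// An informal usage sketch of the solver exercised below (A, x and b are
// placeholder names, not part of this test):
//
//   Eigen::IDRS<Eigen::SparseMatrix<double>,
//               Eigen::DiagonalPreconditioner<double> > solver;
//   solver.compute(A);    // set up the preconditioner for the sparse system
//   x = solver.solve(b);  // iterate until tolerance or max iterations
//
// check_sparse_square_solving() drives this same compute()/solve() cycle on
// randomly generated square systems.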
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+  IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+  IDRS<SparseMatrix<T>, IncompleteLUT<T> >          idrs_colmajor_ilut;
+
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag) );
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut) );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+  CALL_SUBTEST_1(test_idrs_T<double>());
+  CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index e770049e5..b5b764c65 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,6 +9,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
 #ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
@@ -83,7 +84,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
 }
 
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   // DM = dense matrix; SM = sparse matrix
 
@@ -95,7 +96,7 @@
   SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
   SM_a.insert(1,1) = DM_a.coeffRef(1,1) = 0.6469156566545853;
   SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
-  
+
   MatrixXd DM_b(3,2);
   SparseMatrix<double> SM_b(3,2);
   SM_b.insert(0,0) = DM_b.coeffRef(0,0) = 0.9004440976767099;
@@ -165,7 +166,7 @@
   SM_a.insert(0,3) = -0.2;
   SM_a.insert(2,4) =  0.3;
   SM_a.finalize();
-  
+
   SM_b.insert(0,0) =  0.4;
  SM_b.insert(2,1) = -0.5;
   SM_b.finalize();
@@ -183,7 +184,7 @@
   DM_b2.resize(4,8);
   DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
   CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
-  
+
   for(int i = 0; i < g_repeat; i++)
   {
     double density = Eigen::internal::random<double>(0.01,0.5);
@@ -196,35 +197,35 @@
     MatrixXf dA(ra,ca), dB(rb,cb), dC;
     initSparse(density, dA, sA);
     initSparse(density, dB, sB);
-    
+
     sC = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB);
     dC = kroneckerProduct(dA.transpose(),dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB.transpose());
     dC = kroneckerProduct(dA.transpose(),dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA,sB.transpose());
     dC = kroneckerProduct(dA,dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC2 = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(dA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(sA,dB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(2*sA,sB);
     dC = kroneckerProduct(2*dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@@ -236,11 +237,10 @@
 
 #ifdef EIGEN_TEST_PART_2
 // simply check that for a dense kronecker product, sparse module is not needed
-
 #include "main.h"
 #include <Eigen/KroneckerProduct>
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   MatrixXd  a(2,2), b(3,3), c;
   a.setRandom();
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 64f168c16..7f9a81cd3 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
 {
   // Tests using the examples provided by (c)minpack
   CALL_SUBTEST(testLmder1());
diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
index 50dec083d..b032cbf1d 100644
--- a/unsupported/test/matrix_exponential.cpp
+++ b/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
   }
 }
 
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 7c9b68a3c..6d753737d 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -23,9 +23,8 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)
 
 // Returns a matrix with eigenvalues clustered around 0, 1 and 2.
 template<typename MatrixType>
-MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+MatrixType randomMatrixWithRealEivals(const Index size)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   MatrixType diag = MatrixType::Zero(size, size);
@@ -42,16 +41,15 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
 struct randomMatrixWithImagEivals
 {
   // Returns a matrix with eigenvalues clustered around 0 and +/- i.
-  static MatrixType run(const typename MatrixType::Index size);
+  static MatrixType run(const Index size);
 };
 
 // Partial specialization for real matrices
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 0>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     MatrixType diag = MatrixType::Zero(size, size);
     Index i = 0;
@@ -77,9 +75,8 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 1>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     const Scalar imagUnit(0, 1);
@@ -171,7 +168,6 @@ void testMatrixType(const MatrixType& m)
 {
   // Matrices with clustered eigenvalue lead to different code paths
   // in MatrixFunction.h and are thus useful for testing.
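// Background on why clustering matters (paraphrased, not from this patch):
// the evaluator reduces the matrix to triangular Schur form, groups nearby
// eigenvalues into clusters, evaluates the function on each diagonal block
// by a Taylor series, and recovers the off-diagonal blocks by solving
// Sylvester equations. Eigenvalues packed around 0, 1 and 2 exercise both
// the single-cluster and the multi-cluster paths of that algorithm.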
-  typedef typename MatrixType::Index Index;
   const Index size = m.rows();
 
   for (int i = 0; i < g_repeat; i++) {
@@ -181,7 +177,40 @@ void testMatrixType(const MatrixType& m)
   }
 }
 
-void test_matrix_function()
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)
+{
+  // Test if passing Ref and Map objects is possible
+  // (Regression test for Bug #1796)
+  Index size = A.rows();
+  MatrixType X; X.setRandom(size, size);
+  MatrixType Y(size,size);
+  Ref<      MatrixType> R(Y);
+  Ref<const MatrixType> Rc(X);
+  Map<      MatrixType> M(Y.data(), size, size);
+  Map<const MatrixType> Mc(X.data(), size, size);
+
+  X = X*X; // make sure sqrt is possible
+  Y = X.sqrt();
+  R = Rc.sqrt();
+  M = Mc.sqrt();
+  Y = X.exp();
+  R = Rc.exp();
+  M = Mc.exp();
+  X = Y; // make sure log is possible
+  Y = X.log();
+  R = Rc.log();
+  M = Mc.log();
+
+  Y = X.cos() + Rc.cos() + Mc.cos();
+  Y = X.sin() + Rc.sin() + Mc.sin();
+
+  Y = X.cosh() + Rc.cosh() + Mc.cosh();
+  Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+EIGEN_DECLARE_TEST(matrix_function)
 {
   CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
   CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
@@ -190,4 +219,9 @@ void test_matrix_function()
   CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
   CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
   CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+  CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));
+  CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+  CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+  CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
 }
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 7ccfacfdf..dbaf9dbdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -19,7 +19,7 @@ void test2dRotation(const T& tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
@@ -61,7 +61,7 @@ void test3dRotation(const T& tol)
   for (int i=0; i<=20; ++i) {
     v = Matrix<T,3,1>::Random();
     v.normalize();
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
@@ -150,55 +150,55 @@ typedef Matrix<double,3,3,RowMajor> Matrix3dRowMajor;
 typedef Matrix<long double,3,3> Matrix3e;
 typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
 
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
-  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5f));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
   CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
-  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
   CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
 
   CALL_SUBTEST_10(test3dRotation<double>(1e-13));
-  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
   CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
 
   CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3f)); // see bug 614
   CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4f));
  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
index ea541e1ea..034f29217 100644
--- a/unsupported/test/matrix_square_root.cpp
+++ b/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
   VERIFY_IS_APPROX(sqrtA * sqrtA, A);
 }
 
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
 {
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 8b300b78a..2eb40fef6 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
 }
 
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
 {
   CALL_SUBTEST_1(test_minres_T<double>());
//  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 8404f1ff8..000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*
- MPFR C++: Multi-precision floating point number class for C++.
- Based on MPFR library: http://mpfr.org
-
- Project homepage: http://www.holoborodko.com/pavel/mpfr
- Contact e-mail: pavel@holoborodko.com
-
- Copyright (c) 2008-2015 Pavel Holoborodko
-
- Contributors:
- Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
- Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
- Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
- Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
- Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
- Rodney James, Jorge Leitao.
-
- Licensing:
- (A) MPFR C++ is under GNU General Public License ("GPL").
-
- (B) Non-free licenses may also be purchased from the author, for users who
- do not want their programs protected by the GPL.
-
- The non-free licenses are for users that wish to use MPFR C++ in
- their products but are unwilling to release their software
- under the GPL (which would require them to release source code
- and allow free redistribution).
-
- Such users can purchase an unlimited-use license from the author.
- Contact us for more details.
-
- GNU General Public License ("GPL") copyright permissions statement:
- **************************************************************************
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
- // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
- // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 2
-#define MPREAL_VERSION_STRING "3.6.2"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__)
- #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux
-#elif defined(_MSC_VER) // Microsoft Visual C++
- #define IsInf(x) (!_finite(x))
-#else
- #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
- #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic). Borrowed from Eigen.
-#if (__has_feature(cxx_rvalue_references) || \
- defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1600))
-
- #define MPREAL_HAVE_MOVE_SUPPORT
-
- // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
- #define mpfr_is_initialized(x) (0 != (x)->_mpfr_d)
- #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
- (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1800))
-
- #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
- #define MPREAL_MSVC_DEBUGVIEW_CODE DebugView = toString();
- #define MPREAL_MSVC_DEBUGVIEW_DATA std::string DebugView;
-#else
- #define MPREAL_MSVC_DEBUGVIEW_CODE
- #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
- #include <cstdlib> // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1 // Triggers overflow exception during conversion to double if mpreal
- // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
- // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be compatible with new versions of MPFR
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
- #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
- #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
- mpfr_t mp;
-
-public:
-
- // Get default rounding mode & precision
- inline static mp_rnd_t get_default_rnd() { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); }
- inline static mp_prec_t get_default_prec() { return mpfr_get_default_prec(); }
-
- // Constructors && type conversions
- mpreal();
- mpreal(const mpreal& u);
- mpreal(const mpf_t u);
- mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
- // Construct mpreal from mpfr_t structure.
- // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
- mpreal(const mpfr_t u, bool shared = false);
-
- mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
- ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- mpreal& operator=(mpreal&& v);
- mpreal(mpreal&& u);
-#endif
-
- // Operations
- // =
- // +, -, *, /, ++, --, <<, >>
- // *=, +=, -=, /=,
- // <, >, ==, <=, >=
-
- // =
- mpreal& operator=(const mpreal& v);
- mpreal& operator=(const mpf_t v);
- mpreal& operator=(const mpz_t v);
- mpreal& operator=(const mpq_t v);
- mpreal& operator=(const long double v);
- mpreal& operator=(const double v);
- mpreal& operator=(const unsigned long int v);
- mpreal& operator=(const unsigned long long int v);
- mpreal& operator=(const long long int v);
- mpreal& operator=(const unsigned int v);
- mpreal& operator=(const long int v);
- mpreal& operator=(const int v);
- mpreal& operator=(const char* s);
- mpreal& operator=(const std::string& s);
- template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
- // +
- mpreal& operator+=(const mpreal& v);
- mpreal& operator+=(const mpf_t v);
- mpreal& operator+=(const mpz_t v);
- mpreal& operator+=(const mpq_t v);
- mpreal& operator+=(const long double u);
- mpreal& operator+=(const double u);
- mpreal& operator+=(const unsigned long int u);
- mpreal& operator+=(const unsigned int u);
- mpreal& operator+=(const long int u);
- mpreal& operator+=(const int u);
-
- mpreal& operator+=(const long long int u);
- mpreal& operator+=(const unsigned long long int u);
- mpreal& operator-=(const long long int u);
- mpreal& operator-=(const unsigned long long int u);
- mpreal& operator*=(const long long int u);
- mpreal& operator*=(const unsigned long long int u);
- mpreal& operator/=(const long long int u);
- mpreal& operator/=(const unsigned long long int u);
-
- const mpreal operator+() const;
- mpreal& operator++ ();
- const mpreal operator++ (int);
-
- // -
- mpreal& operator-=(const mpreal& v);
- mpreal& operator-=(const mpz_t v);
- mpreal& operator-=(const mpq_t v);
- mpreal& operator-=(const long double u);
- mpreal& operator-=(const double u);
- mpreal& operator-=(const unsigned long int u);
- mpreal& operator-=(const unsigned int u);
- mpreal& operator-=(const long int u);
- mpreal& operator-=(const int u);
- const mpreal operator-() const;
- friend const mpreal operator-(const unsigned long int b, const mpreal& a);
- friend const mpreal operator-(const unsigned int b, const mpreal& a);
- friend const mpreal operator-(const long int b, const mpreal& a);
- friend const mpreal operator-(const int b, const mpreal& a);
- friend const mpreal operator-(const double b, const mpreal& a);
- mpreal& operator-- ();
- const mpreal operator-- (int);
-
- // *
- mpreal& operator*=(const mpreal& v);
- mpreal& operator*=(const mpz_t v);
- mpreal& operator*=(const mpq_t v);
- mpreal& operator*=(const long double v);
- mpreal& operator*=(const double v);
- mpreal& operator*=(const unsigned long int v);
- mpreal& operator*=(const unsigned int v);
- mpreal& operator*=(const long int v);
- mpreal& operator*=(const int v);
-
- // /
- mpreal& operator/=(const mpreal& v);
- mpreal& operator/=(const mpz_t v);
- mpreal& operator/=(const mpq_t v);
- mpreal& operator/=(const long double v);
- mpreal& operator/=(const double v);
- mpreal& operator/=(const unsigned long int v);
- mpreal& operator/=(const unsigned int v);
- mpreal& operator/=(const long int v);
- mpreal& operator/=(const int v);
- friend const mpreal operator/(const unsigned long int b, const mpreal& a);
- friend const mpreal operator/(const unsigned int b, const mpreal& a);
- friend const mpreal operator/(const long int b, const mpreal& a);
- friend const mpreal operator/(const int b, const mpreal& a);
- friend const mpreal operator/(const double b, const mpreal& a);
-
- //<<= Fast Multiplication by 2^u
- mpreal& operator<<=(const unsigned long int u);
- mpreal& operator<<=(const unsigned int u);
- mpreal& operator<<=(const long int u);
- mpreal& operator<<=(const int u);
-
- //>>= Fast Division by 2^u
- mpreal& operator>>=(const unsigned long int u);
- mpreal& operator>>=(const unsigned int u);
- mpreal& operator>>=(const long int u);
- mpreal& operator>>=(const int u);
-
- // Type Conversion operators
- bool toBool ( ) const;
- long toLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long toULong (mp_rnd_t mode = GMP_RNDZ) const;
- long long toLLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long long toULLong (mp_rnd_t mode = GMP_RNDZ) const;
- float toFloat (mp_rnd_t mode = GMP_RNDN) const;
- double toDouble (mp_rnd_t mode = GMP_RNDN) const;
- long double toLDouble (mp_rnd_t mode = GMP_RNDN) const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
- explicit operator bool () const { return toBool(); }
- explicit operator int () const { return int(toLong()); }
- explicit operator long () const { return toLong(); }
- explicit operator long long () const { return toLLong(); }
- explicit operator unsigned () const { return unsigned(toULong()); }
- explicit operator unsigned long () const { return toULong(); }
- explicit operator unsigned long long () const { return toULLong(); }
- explicit operator float () const { return toFloat(); }
- explicit operator double () const { return toDouble(); }
- explicit operator long double () const { return toLDouble(); }
-#endif
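-
- // Illustrative sketch (not part of the original header): with explicit
- // converters enabled, narrowing conversions must be spelled out, e.g.:
- //
- //   mpfr::mpreal x("3.141592653589793238462643");
- //   double d = static_cast<double>(x); // explicit conversion, may round
- //   // double e = x;                   // would not compile: converters are explicit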
-
- // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
- ::mpfr_ptr mpfr_ptr();
- ::mpfr_srcptr mpfr_ptr() const;
- ::mpfr_srcptr mpfr_srcptr() const;
-
- // Convert mpreal to string with n significant digits in base b
- // n = -1 -> convert with the maximum available digits
- std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- std::string toString(const std::string& format) const;
-#endif
-
- std::ostream& output(std::ostream& os) const;
-
- // Math Functions
- friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
- friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend int cmpabs(const mpreal& a,const mpreal& b);
-
- friend const mpreal log (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
- friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal acos (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asin (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
- friend const mpreal acot (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asec (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsc (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csch (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal coth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal eint (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal gamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tgamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lngamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lgamma (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
- friend const mpreal zeta (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erf (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erfc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
- friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
- friend int sgn(const mpreal& v); // returns -1 or +1
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- friend int sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal li2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal rec_sqrt (const mpreal& v, mp_rnd_t rnd_mode);
-
- // MATLAB's semantic equivalents
- friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
- friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ai (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal grandom (unsigned int seed);
-#endif
-
- // Uniformly distributed random number generation in [0,1] using the
- // Mersenne-Twister algorithm by default.
- // Use the parameter to set the seed, e.g.: random((unsigned)time(NULL))
- // See urandom() for more precise control.
- friend const mpreal random(unsigned int seed);
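-
- // Usage sketch (illustrative, assumes <ctime> for time()):
- //
- //   mpfr::mpreal r = mpfr::random((unsigned)time(NULL)); // r in [0,1]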
-
- // Splits mpreal value into fractional and integer parts.
- // Returns fractional part and stores integer part in n.
- friend const mpreal modf(const mpreal& v, mpreal& n);
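-
- // Example (illustrative): modf(mpreal("2.75"), n) returns 0.75 and sets n = 2.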
-
- // Constants
- // Don't forget to call mpfr_free_cache() in every thread that uses these const_* functions.
- friend const mpreal const_log2 (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_pi (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_euler (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode);
-
- // returns +inf if sign >= 0, otherwise -inf
- friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
- // Output/ Input
- friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
- friend std::istream& operator>>(std::istream& is, mpreal& v);
-
- // Integer Related Functions
- friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ceil (const mpreal& v);
- friend const mpreal floor(const mpreal& v);
- friend const mpreal round(const mpreal& v);
- friend const mpreal trunc(const mpreal& v);
- friend const mpreal rint_ceil (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal frac (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal remainder ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- // Miscellaneous Functions
- friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
- friend const mpreal nextabove (const mpreal& x);
- friend const mpreal nextbelow (const mpreal& x);
-
- // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
- friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
- // Instance Checkers
- friend bool (isnan) (const mpreal& v);
- friend bool (isinf) (const mpreal& v);
- friend bool (isfinite) (const mpreal& v);
-
- friend bool isnum (const mpreal& v);
- friend bool iszero (const mpreal& v);
- friend bool isint (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend bool isregular(const mpreal& v);
-#endif
-
- // Set/Get instance properties
- inline mp_prec_t get_prec() const;
- inline void set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode
-
- // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
- inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
- inline int getPrecision() const;
-
- // Set mpreal to +/- inf, NaN, +/-0
- mpreal& setInf (int Sign = +1);
- mpreal& setNan ();
- mpreal& setZero (int Sign = +1);
- mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
- // Exponent
- mp_exp_t get_exp();
- int set_exp(mp_exp_t e);
- int check_range (int t, mp_rnd_t rnd_mode = get_default_rnd());
- int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
- // Inexact conversion from floating point
- inline bool fits_in_bits(double x, int n);
-
- // Set/Get global properties
- static void set_default_prec(mp_prec_t prec);
- static void set_default_rnd(mp_rnd_t rnd_mode);
-
- static mp_exp_t get_emin (void);
- static mp_exp_t get_emax (void);
- static mp_exp_t get_emin_min (void);
- static mp_exp_t get_emin_max (void);
- static mp_exp_t get_emax_min (void);
- static mp_exp_t get_emax_max (void);
- static int set_emin (mp_exp_t exp);
- static int set_emax (mp_exp_t exp);
-
- // Efficient swapping of two mpreal values - needed for std algorithms
- friend void swap(mpreal& x, mpreal& y);
-
- friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
- // Human-friendly Debug Preview in Visual Studio.
- // Put one of these lines:
- //
- // mpfr::mpreal=<DebugView> ; Show value only
- // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits ; Show value & precision
- //
- // at the beginning of
- // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
- MPREAL_MSVC_DEBUGVIEW_DATA
-
- // "Smart" resources deallocation. Checks if instance initialized before deletion.
- void clear(::mpfr_ptr);
-};
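-
- // Minimal usage sketch for the class above (illustrative, not part of the
- // original header). Defaults should be set before any mpreal is constructed,
- // since existing values keep the precision they were created with:
- //
- //   using mpfr::mpreal;
- //   mpreal::set_default_prec(mpfr::digits2bits(50)); // ~50 decimal digits
- //   mpreal a = 1;
- //   mpreal b = sqrt(mpreal(2), mpreal::get_default_rnd());
- //   std::cout << a / b << std::endl;
- //   mpfr_free_cache(); // per thread, when the const_* functions are used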
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
- std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates an mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
- mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
- mpfr_set_zero_fast(mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
- mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
- mpfr_set (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
- mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds no pointer to actual data
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t u, bool shared)
-{
- if(shared)
- {
- std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
- }
- else
- {
- mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
- mpfr_set (mpfr_ptr(), u, mpreal::get_default_rnd());
- }
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
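-
- // Illustrative sketch of the two construction modes above (assumes a raw
- // mpfr_t 't' initialized by the caller):
- //
- //   mpfr_t t; mpfr_init2(t, 128); mpfr_set_ui(t, 42, GMP_RNDN);
- //   mpfr::mpreal deep(t);       // deep copy: owns its own data, caller still clears t
- //   mpfr::mpreal view(t, true); // shared: aliases t's data, so do not clear t as well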
-
-inline mpreal::mpreal(const mpf_t u)
-{
- mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
- mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_z(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_q(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(), u, mode);
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ld(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_uj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_sj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s, base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- if(mpfr_is_initialized(x))
-#endif
- mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
- clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
- // Use SFINAE to restrict instantiation of the arithmetic operators to numeric types only.
- // This is needed for smooth integration with libraries based on expression templates, like Eigen.
- // TODO: Do the same for the boolean operators.
- template <typename ArgumentType> struct result_type {};
-
- template <> struct result_type<mpreal> {typedef mpreal type;};
- template <> struct result_type<mpz_t> {typedef mpreal type;};
- template <> struct result_type<mpq_t> {typedef mpreal type;};
- template <> struct result_type<long double> {typedef mpreal type;};
- template <> struct result_type<double> {typedef mpreal type;};
- template <> struct result_type<unsigned long int> {typedef mpreal type;};
- template <> struct result_type<unsigned int> {typedef mpreal type;};
- template <> struct result_type<long int> {typedef mpreal type;};
- template <> struct result_type<int> {typedef mpreal type;};
- template <> struct result_type<long long> {typedef mpreal type;};
- template <> struct result_type<unsigned long long> {typedef mpreal type;};
-}
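-
- // Effect of the result_type trait (illustrative): the operator templates below
- // participate in overload resolution only for the listed numeric types, since
- // result_type<T>::type exists only for those specializations, e.g.:
- //
- //   mpreal x(1);
- //   x + 2;         // OK: result_type<int>::type is mpreal
- //   // x + "two";  // substitution failure: result_type<const char*> is undefined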
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
- // Gives the max & min values for the required precision;
- // minval is 'safe' in that 1 / minval does not overflow,
- // maxval is 'safe' in that 1 / maxval does not underflow.
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
- // 'Bitwise' equality check:
- // a and b may differ by at most maxUlps units in the last place (ULPs).
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
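-
- // Comparison sketch (illustrative): the fuzzy checks tolerate rounding noise
- // that the exact operator== would reject, e.g.:
- //
- //   mpreal a = mpreal(1)/3, b = 1 - mpreal(2)/3;
- //   bool exact = (a == b);           // may be false due to rounding
- //   bool fuzzy = isEqualFuzzy(a, b); // true within eps(min{|a|,|b|})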
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int bits2digits(mp_prec_t b);
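-
- // Example (illustrative): to work with roughly 100 significant decimal digits,
- //
- //   mpreal::set_default_prec(digits2bits(100)); // ceil(100*log2(10)) = 333 bits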
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
- if (this != &v)
- {
- mp_prec_t tp = mpfr_get_prec( mpfr_srcptr());
- mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
- if(tp != vp){
- clear(mpfr_ptr());
- mpfr_init2(mpfr_ptr(), vp);
- }
-
- mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
- mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
- mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
- mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
- mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
- mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
- mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
- // Use the other converters for more precise control over base, precision & rounding:
- //
- // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
- // mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
- //
- // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
- // Use the other converters for more precise control over base, precision & rounding:
- //
- // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
- // mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
- //
- // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
-
-template <typename real_t>
-inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
-{
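-    // Note: only the real part of z is kept; the imaginary part is discarded.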
- return *this = z.real();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// + Addition
-inline mpreal& mpreal::operator+=(const mpreal& v)
-{
- mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpf_t u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpz_t u)
-{
- mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpq_t u)
-{
- mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const long double u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const double u)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-#else
- *this += mpreal(u);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned long int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-
-inline const mpreal mpreal::operator+()const { return mpreal(*this); }
-
-inline const mpreal operator+(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline mpreal& mpreal::operator++()
-{
- return *this += 1;
-}
-
-inline const mpreal mpreal::operator++ (int)
-{
- mpreal x(*this);
- *this += 1;
- return x;
-}
-
-inline mpreal& mpreal::operator--()
-{
- return *this -= 1;
-}
-
-inline const mpreal mpreal::operator-- (int)
-{
- mpreal x(*this);
- *this -= 1;
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// - Subtraction
-inline mpreal& mpreal::operator-=(const mpreal& v)
-{
- mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpz_t v)
-{
- mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpq_t v)
-{
- mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long double v)
-{
- *this -= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this -= mpreal(v);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned long int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal mpreal::operator-()const
-{
- mpreal u(*this);
- mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
- return u;
-}
-
-inline const mpreal operator-(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator-(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(b, mpfr_get_prec(a.mpfr_ptr()));
- x -= a;
- return x;
-#endif
-}
-
-inline const mpreal operator-(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// * Multiplication
-inline mpreal& mpreal::operator*= (const mpreal& v)
-{
- mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpz_t v)
-{
- mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpq_t v)
-{
- mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long double v)
-{
- *this *= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this *= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned long int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator*(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// / Division
-inline mpreal& mpreal::operator/=(const mpreal& v)
-{
- mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpz_t v)
-{
- mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpq_t v)
-{
- mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long double v)
-{
- *this /= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this /= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned long int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator/(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
- mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator/(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- x /= a;
- return x;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Shift operators - Multiplication/Division by a power of 2
-inline mpreal& mpreal::operator<<=(const unsigned long int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const unsigned int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const long int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned long int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const long int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned long int k)
-{
- return mul_2ui(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned int k)
-{
- return mul_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
- return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
- return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
- return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
- return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
- return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
- return div_2si(v,static_cast<long int>(k));
-}
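-
- // Shift sketch (illustrative): these operators scale by exact powers of two,
- // they are not bitwise shifts, e.g.:
- //
- //   mpreal x = 3;
- //   x <<= 2;           // x == 12 (3 * 2^2)
- //   mpreal y = x >> 1; // y == 6  (12 / 2^1); x is unchanged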
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Relational operators
-
-// WARNING:
-//
-// Please note that the following checks for double NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) = (b != b)
-// isnan(b) = !(b == b) (used in the code below)
-//
-// Be cautious with compiler options that break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
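-
-// Illustrative consequence of the rule above:
-//
-//   mpreal a; a.setNan();
-//   bool lt = (a < 0.0); // false: the isnan guard short-circuits the comparison
-//   bool ne = (a != a);  // true: mpfr_equal_p reports NaN as unequal to itself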
-
-inline bool operator > (const mpreal& a, const mpreal& b ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator > (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); }
-
-inline bool operator >= (const mpreal& a, const mpreal& b ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-// inline bool operator >= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); }
-
-inline bool operator < (const mpreal& a, const mpreal& b ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator < (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); }
-
-inline bool operator <= (const mpreal& a, const mpreal& b ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); }
-
-inline bool operator == (const mpreal& a, const mpreal& b ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); }
-
-inline bool operator != (const mpreal& a, const mpreal& b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long double b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const double b ){ return !(a == b); }
-
-inline bool (isnan) (const mpreal& op){ return (mpfr_nan_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isinf) (const mpreal& op){ return (mpfr_inf_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); }
-inline bool iszero (const mpreal& op){ return (mpfr_zero_p (op.mpfr_srcptr()) != 0 ); }
-inline bool isint (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool mpreal::toBool ( ) const { return mpfr_zero_p (mpfr_srcptr()) == 0; }
-inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); }
-inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); }
-inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); }
-inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); }
-inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); }
-inline long long mpreal::toLLong (mp_rnd_t mode) const { return mpfr_get_sj (mpfr_srcptr(), mode); }
-inline unsigned long long mpreal::toULLong (mp_rnd_t mode) const { return mpfr_get_uj (mpfr_srcptr(), mode); }
-
-inline ::mpfr_ptr mpreal::mpfr_ptr() { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_ptr() const { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_srcptr() const { return mp; }
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
- std::ostringstream oss;
- oss << f << t;
- return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
- char *s = NULL;
- std::string out;
-
- if( !format.empty() )
- {
- if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
- {
- out = std::string(s);
-
- mpfr_free_str(s);
- }
- }
-
- return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
- // TODO: Add extended format specification (f, e, rounding mode) as is done in the output operator
- (void)b;
- (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
- std::ostringstream format;
-
- int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
- format << "%." << digits << "RNg";
-
- return toString(format.str());
-
-#else
-
- char *s, *ns = NULL;
- size_t slen, nslen;
- mp_exp_t exp;
- std::string out;
-
- if(mpfr_inf_p(mp))
- {
- if(mpfr_sgn(mp)>0) return "+Inf";
- else return "-Inf";
- }
-
- if(mpfr_zero_p(mp)) return "0";
- if(mpfr_nan_p(mp)) return "NaN";
-
- s = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
- ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
- if(s!=NULL && ns!=NULL)
- {
- slen = strlen(s);
- nslen = strlen(ns);
- if(nslen<=slen)
- {
- mpfr_free_str(s);
- s = ns;
- slen = nslen;
- }
- else {
- mpfr_free_str(ns);
- }
-
- // Use human-friendly formatting where possible
- if (exp>0 && static_cast<size_t>(exp)<slen)
- {
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp) ptr--;
-
- if(ptr==s+exp) out = std::string(s,exp+1);
- else out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
- //out = string(s,exp+1)+'.'+string(s+exp+1);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
- if(ptr==s+exp-1) out = std::string(s,exp);
- else out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
- //out = string(s,exp)+'.'+string(s+exp);
- }
-
- }else{ // exp<=0 || exp>=slen
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+1) ptr--;
-
- if(ptr==s+1) out = std::string(s,2);
- else out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
- //out = string(s,2)+'.'+string(s+2);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s) ptr--;
-
- if(ptr==s) out = std::string(s,1);
- else out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
- //out = string(s,1)+'.'+string(s+1);
- }
-
- // Make final string
- if(--exp)
- {
- if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
- else out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
- }
- }
-
- mpfr_free_str(s);
- return out;
- }else{
- return "conversion error!";
- }
-#endif
-}
-
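Taken together, the two overloads above reduce every conversion to an mpfr_asprintf format string on modern MPFR builds. A usage sketch (assuming the default arguments declared in the class body, i.e. full precision and base 10):

    mpfr::mpreal pi = mpfr::const_pi();
    std::string s10 = pi.toString(10);  // 10 significant digits via "%.10RNg"
    std::string all = pi.toString();    // enough digits for the full precision of pi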
-
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
- std::ostringstream format;
- const std::ios::fmtflags flags = os.flags();
-
- format << ((flags & std::ios::showpos) ? "%+" : "%");
- if (os.precision() >= 0)
- format << '.' << os.precision() << "R*"
- << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
- (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
- 'g');
- else
- format << "R*e";
-
- char *s = NULL;
- if(!(mpfr_asprintf(&s, format.str().c_str(),
- mpfr::mpreal::get_default_rnd(),
- mpfr_srcptr())
- < 0))
- {
- os << std::string(s);
- mpfr_free_str(s);
- }
- return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
- return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
- // TODO: use std::hexfloat and other stream flags to set up the base
- std::string tmp;
- is >> tmp;
- mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
- return is;
-}
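Round-tripping through the stream operators above exercises both paths: output goes through mpfr_asprintf with the stream's precision and float-field flags, and input is parsed back with mpfr_set_str in base 10. A minimal sketch (assuming <sstream> and <iomanip> are included):

    std::ostringstream os;
    os << std::setprecision(50) << mpfr::const_pi();
    mpfr::mpreal x;
    std::istringstream is(os.str());
    is >> x;  // x now holds pi to ~50 digits, subject to the current default precision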
-
-//////////////////////////////////////////////////////////////////////////
-// Bits - decimal digits relation
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
- const double LOG2_10 = 3.3219280948873624;
-
- return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
- const double LOG10_2 = 0.30102999566398119;
-
- return int(std::floor( b * LOG10_2 ));
-}
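A quick numeric check of the two identities above (a fragment meant to run inside any function body, with <cassert> included):

    // 53 bits, an IEEE double mantissa, carry floor(53 * log10(2)) = 15 digits.
    assert(mpfr::bits2digits(53) == 15);
    // 77 decimal digits require ceil(77 * log2(10)) = 256 bits.
    assert(mpfr::digits2bits(77) == 256);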
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline int sgn(const mpreal& op)
-{
- return mpfr_sgn(op.mpfr_srcptr());
-}
-
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
- mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), (sign < 0 ? 1 : 0), RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
- return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
- mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
- mpfr_set_inf(mpfr_ptr(), sign);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
- mpfr_set_nan(mpfr_ptr());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- mpfr_set_zero(mpfr_ptr(), sign);
-#else
- mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
- setSign(sign);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
- return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
- mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp ()
-{
- return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
- int x = mpfr_set_exp(mpfr_ptr(), e);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
- *exp = mpfr_get_exp(y.mpfr_srcptr());
- mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
- return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
- mpreal x(v);
-
- // rounding is not important since we are only adjusting the exponent (an exact operation)
- mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
- return ldexp(v, exp);
-}
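Since frexp and ldexp above only touch the exponent field, they are exact inverses of each other. A small illustration:

    mp_exp_t e;
    mpfr::mpreal x = 6.0;
    mpfr::mpreal m = mpfr::frexp(x, &e);  // m = 0.75, e = 3, so x = m * 2^e
    mpfr::mpreal y = mpfr::ldexp(m, e);   // reconstructs 6.0 exactly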
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
- /* the smallest eps such that 1 + eps != 1 */
- return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
- /* the smallest eps such that x + eps != x */
- if( x < 0)
- {
- return nextabove(-x) + x;
- }else{
- return nextabove( x) - x;
- }
-}
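With the definition above, the result adapts to the precision of the argument. At 53 bits it reproduces the IEEE double epsilon (a minimal sketch):

    mpfr::mpreal::set_default_prec(53);
    mpfr::mpreal eps = mpfr::machine_epsilon(mpfr::mpreal(1));
    // eps == 2^-52, the gap between 1 and nextabove(1)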
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
- /* min = 1/2 * 2^emin = 2^(emin - 1) */
- return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
- /* max = (1 - eps) * 2^emax, eps is machine epsilon */
- return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
- return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
- return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
- return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
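Usage sketch: two results of mathematically equivalent computations can be compared up to a few ulps at the scale of the larger operand (functions as defined above):

    mpfr::mpreal a = mpfr::sqrt(mpfr::mpreal(2));
    mpfr::mpreal b = mpfr::mpreal(2) / mpfr::sqrt(mpfr::mpreal(2));
    bool close = mpfr::isEqualUlps(a, b, 4);  // |a - b| <= 4 * eps(max(|a|, |b|))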
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
- mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
- return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
- return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
- mpreal f(v);
-
- // rounding is not important since f has the same precision as v (exact operation)
- mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
- mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
- return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
- return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
- int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
- return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
- return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
- return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
- return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
- return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
- return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
- return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
- return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f) \
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); \
- mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r); \
- return y;
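For reference, each one-line wrapper below is the literal expansion of this macro; e.g. MPREAL_UNARY_MATH_FUNCTION_BODY(sqr) expands to:

    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));  // result inherits the precision of x
    mpfr_sqr(y.mpfr_ptr(), x.mpfr_srcptr(), r);   // compute into y with rounding mode r
    return y;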
-
-inline const mpreal sqr (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqr ); }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt); }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
- mpreal y;
- mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
- return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
- return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
- mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
- return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
- return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
- return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt (const long double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-inline const mpreal sqrt (const double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-
-inline const mpreal cbrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt ); }
-inline const mpreal fabs (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal abs (const mpreal& x, mp_rnd_t r) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal log (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log ); }
-inline const mpreal log2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log2 ); }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log10); }
-inline const mpreal exp (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp ); }
-inline const mpreal exp2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 ); }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp10); }
-inline const mpreal cos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cos ); }
-inline const mpreal sin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sin ); }
-inline const mpreal tan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tan ); }
-inline const mpreal sec (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sec ); }
-inline const mpreal csc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csc ); }
-inline const mpreal cot (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cot ); }
-inline const mpreal acos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acos ); }
-inline const mpreal asin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asin ); }
-inline const mpreal atan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atan ); }
-
-inline const mpreal logb (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { return log2 (abs(x),r); }
-
-inline const mpreal acot (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atan (1/v, r); }
-inline const mpreal asec (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acos (1/v, r); }
-inline const mpreal acsc (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asin (1/v, r); }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atanh(1/v, r); }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acosh(1/v, r); }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asinh(1/v, r); }
-
-inline const mpreal cosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cosh ); }
-inline const mpreal sinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sinh ); }
-inline const mpreal tanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tanh ); }
-inline const mpreal sech (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sech ); }
-inline const mpreal csch (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csch ); }
-inline const mpreal coth (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(coth ); }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acosh); }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asinh); }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atanh); }
-
-inline const mpreal log1p (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log1p ); }
-inline const mpreal expm1 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(expm1 ); }
-inline const mpreal eint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(eint ); }
-inline const mpreal gamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal tgamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma); }
-inline const mpreal zeta (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(zeta ); }
-inline const mpreal erf (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erf ); }
-inline const mpreal erfc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erfc ); }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j0 ); }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j1 ); }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y0 ); }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y1 ); }
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(),
- mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(0, prec);
- mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
- return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- int tsignp;
-
- if(signp) mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode);
- else mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
- return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
-
- a.set_prec(p1>p2?p1:p2);
-
- mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpfr_srcptr *p = new mpfr_srcptr[n];
-
- for (unsigned long int i = 0; i < n; i++)
- p[i] = tab[i].mpfr_srcptr();
-
- mpreal x;
- status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
- delete [] p;
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- /* R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
- return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- (void)rnd_mode;
-
- /*
-
- m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
- The following are true by convention:
- - mod(x,0) is x
- - mod(x,x) is 0
- - mod(x,y) for x != y and y != 0 has the same sign as y.
-
- */
-
- if(iszero(y)) return x;
- if(x == y) return 0;
-
- mpreal m = x - floor(x / y) * y;
-
- m.setSign(sgn(y)); // make sure the result has the same sign as y
-
- return m;
-}
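A few concrete values for the conventions listed above, contrasted with rem (a sketch; all arguments use the default precision):

    mpfr::mod(mpfr::mpreal( 7), mpfr::mpreal( 3));  // ->  1
    mpfr::mod(mpfr::mpreal(-7), mpfr::mpreal( 3));  // ->  2   (-7 - floor(-7/3)*3 = -7 + 9)
    mpfr::mod(mpfr::mpreal( 7), mpfr::mpreal(-3));  // -> -2   (sign follows y)
    mpfr::rem(mpfr::mpreal(-7), mpfr::mpreal( 3));  // -> -1   (-7 - trunc(-7/3)*3 = -7 + 6)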
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t yp, xp;
-
- yp = y.get_prec();
- xp = x.get_prec();
-
- a.set_prec(yp>xp?yp:xp);
-
- mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
- return x;
-}
-#endif // MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(digamma); }
-inline const mpreal ai (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(ai); }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_log2(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_pi(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_euler(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_catalan(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
- mpreal x(0, p);
- mpfr_set_inf(x.mpfr_ptr(), sign);
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
- mpreal x(v);
- mpfr_ceil(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
- mpreal x(v);
- mpfr_floor(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
- mpreal x(v);
- mpfr_round(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
- mpreal x(v);
- mpfr_trunc(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal rint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint ); }
-inline const mpreal rint_ceil (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil ); }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor); }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round); }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc); }
-inline const mpreal frac (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(frac ); }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void swap (mpreal& a, mpreal& b) { mpfr_swap(a.mp,b.mp); }
-inline const mpreal (max)(const mpreal& x, const mpreal& y){ return (x>y?x:y); }
-inline const mpreal (min)(const mpreal& x, const mpreal& y){ return (x<y?x:y); }
-
-inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
-{
- mpreal a(x);
- mpfr_nexttoward(a.mp,y.mp);
- return a;
-}
-
-inline const mpreal nextabove (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextabove(a.mp);
- return a;
-}
-
-inline const mpreal nextbelow (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextbelow(a.mp);
- return a;
-}
-
-inline const mpreal urandomb (gmp_randstate_t& state)
-{
- mpreal x;
- mpfr_urandomb(x.mpfr_ptr(),state);
- return x;
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
- return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
- mpreal x;
- mpfr_random2(x.mpfr_ptr(),size,exp);
- return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- seeds the generator and returns the first random number
-// a = random();     <- returns the next random numbers
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::urandom(state);
-#else
- if(seed != 0) std::srand(seed);
- return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
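The seeding contract described above, in code form (a minimal sketch):

    mpfr::mpreal a = mpfr::random(42);  // seeds the shared state, returns the first sample in [0, 1)
    mpfr::mpreal b = mpfr::random();    // later calls reuse the same state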
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
- return x;
-}
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
- mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
- mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
- int i;
- double t;
- return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- mpreal x(a);
- mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
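The net effect of this overload family is that each call is routed to the most specific MPFR primitive available, falling back to mpfr_pow only when a sign or type forces it. For instance (a sketch using the overloads above):

    mp_rnd_t r = mpfr::mpreal::get_default_rnd();
    mpfr::pow(2ul, 10ul, r);               // mpfr_ui_pow_ui
    mpfr::pow(mpfr::mpreal(2), -3L, r);    // mpfr_pow_si
    mpfr::pow(-2L, mpfr::mpreal(0.5), r);  // negative base: falls back to mpfr_pow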
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use the efficient version of swap (via argument-dependent "Koenig" lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
- // we are allowed to extend namespace std with specializations only
- template <>
- inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
- {
- return mpfr::swap(x, y);
- }
-
- template<>
- class numeric_limits<mpfr::mpreal>
- {
- public:
- static const bool is_specialized = true;
- static const bool is_signed = true;
- static const bool is_integer = false;
- static const bool is_exact = false;
- static const int radix = 2;
-
- static const bool has_infinity = true;
- static const bool has_quiet_NaN = true;
- static const bool has_signaling_NaN = true;
-
- static const bool is_iec559 = true; // = IEEE 754
- static const bool is_bounded = true;
- static const bool is_modulo = false;
- static const bool traps = true;
- static const bool tinyness_before = true;
-
- static const float_denorm_style has_denorm = denorm_absent;
-
- inline static mpfr::mpreal (min) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::minval(precision); }
- inline static mpfr::mpreal (max) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(precision); }
- inline static mpfr::mpreal lowest (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(precision); }
-
- // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
- inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(precision); }
-
- // Returns smallest eps such that x + eps != x (relative machine epsilon)
- inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) { return mpfr::machine_epsilon(x); }
-
- inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision);
- else return mpfr::mpreal(1.0, precision);
- }
-
- inline static const mpfr::mpreal infinity() { return mpfr::const_infinity(); }
- inline static const mpfr::mpreal quiet_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal signaling_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal denorm_min() { return (min)(); }
-
- // Note that the exponent range is not fixed in MPFR
- static const int min_exponent = MPFR_EMIN_DEFAULT;
- static const int max_exponent = MPFR_EMAX_DEFAULT;
- MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
- MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
- // The following members should be constants according to the standard, but they can
- // vary at run time in MPFR, so we define them as functions here.
- //
- // This is the preferable way to specialize std::numeric_limits<mpfr::mpreal>,
- // but it is incompatible with the standard interface and might not work with other
- // libraries, e.g. Boost. See below for a compatible implementation.
- inline static float_round_style round_style()
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- switch (r)
- {
- case GMP_RNDN: return round_to_nearest;
- case GMP_RNDZ: return round_toward_zero;
- case GMP_RNDU: return round_toward_infinity;
- case GMP_RNDD: return round_toward_neg_infinity;
- default: return round_indeterminate;
- }
- }
-
- inline static int digits() { return int(mpfr::mpreal::get_default_prec()); }
- inline static int digits(const mpfr::mpreal& x) { return x.getPrecision(); }
-
- inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return mpfr::bits2digits(precision);
- }
-
- inline static int digits10(const mpfr::mpreal& x)
- {
- return mpfr::bits2digits(x.getPrecision());
- }
-
- inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return digits10(precision);
- }
-#else
- // Digits and round_style are NOT constants when it comes to mpreal.
- // If possible, please use functions digits() and round_style() defined above.
- //
- // These default values are preserved for compatibility with existing libraries, e.g. Boost.
- // Adjust them to suit your application.
- //
- // For example, if you use 256 bits of precision uniformly in your program, then:
- // digits = 256
- // digits10 = 77
- // max_digits10 = 78
- //
- // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
- static const std::float_round_style round_style = round_to_nearest;
- static const int digits = 53;
- static const int digits10 = 15;
- static const int max_digits10 = 16;
-#endif
- };
-
-}
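With MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS defined, the specialization above exposes precision-aware queries as functions (a minimal sketch):

    mpfr::mpreal::set_default_prec(256);
    int d10 = std::numeric_limits<mpfr::mpreal>::digits10();  // 77 for 256 bits
    mpfr::mpreal eps = std::numeric_limits<mpfr::mpreal>::epsilon();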
-
-#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 685e7ea45..10beb0714 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -1,3 +1,4 @@ +#include <mpreal.h> // Must be included before main.h. #include "main.h" #include <Eigen/MPRealSupport> #include <Eigen/LU> @@ -7,7 +8,7 @@ using namespace mpfr; using namespace Eigen; -void test_mpreal_support() +EIGEN_DECLARE_TEST(mpreal_support) { // set precision to 256 bits (double has only 53 bits) mpreal::set_default_prec(256); diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp index 706a816f7..1c4438134 100644 --- a/unsupported/test/openglsupport.cpp +++ b/unsupported/test/openglsupport.cpp @@ -9,15 +9,24 @@ #include <main.h> #include <iostream> +#include <string> + +#if defined(__APPLE_CC__) + // Prevent deprecation warnings caused by GLEW on MacOS. + #define GL_SILENCE_DEPRECATION 1 +#endif #include <GL/glew.h> #include <Eigen/OpenGLSupport> -#include <GL/glut.h> -using namespace Eigen; - - +#if defined(__APPLE_CC__) + #include <GLUT/glut.h> +#else + #include <GL/freeglut.h> +#endif +using namespace Eigen; #define VERIFY_MATRIX(CODE,REF) { \ + glMatrixMode(GL_MODELVIEW); \ glLoadIdentity(); \ CODE; \ Matrix<float,4,4,ColMajor> m; m.setZero(); \ @@ -40,7 +49,7 @@ using namespace Eigen; } \ VERIFY_IS_APPROX(value, data); \ } - + #define VERIFY_UNIFORMi(NAME,TYPE) { \ TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \ TYPE data; \ @@ -53,175 +62,324 @@ using namespace Eigen; } \ VERIFY_IS_APPROX(value, data); \ } - -void printInfoLog(GLuint objectID) + +void printProgramInfoLog(GLuint objectID) { int infologLength, charsWritten; GLchar *infoLog; - glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &infologLength); + glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength); if(infologLength > 0) { infoLog = new GLchar[infologLength]; glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog); - if (charsWritten>0) + if (charsWritten > 0) + std::cerr << "Program info : \n" << infoLog << std::endl; + delete[] infoLog; + } +} + +void printShaderInfoLog(GLuint objectID) +{ + int infologLength, charsWritten; + GLchar *infoLog; + glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength); + if(infologLength > 0) + { + infoLog = new GLchar[infologLength]; + glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog); + if (charsWritten > 0) std::cerr << "Shader info : \n" << infoLog << std::endl; delete[] infoLog; } } -GLint createShader(const char* vtx, const char* frg) +GLint createProgram(const char* vtx, const char* frg, bool print_errors = true) { GLint prg_id = glCreateProgram(); GLint vtx_id = glCreateShader(GL_VERTEX_SHADER); GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER); GLint ok; - + glShaderSource(vtx_id, 1, &vtx, 0); glCompileShader(vtx_id); - glGetShaderiv(vtx_id,GL_COMPILE_STATUS,&ok); + glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok); if(!ok) { - std::cerr << "vtx compilation failed\n"; + if (print_errors) + { + std::cerr << "vtx compilation failed\n"; + std::cerr << "Source:\n" << vtx << "\n"; + printShaderInfoLog(vtx_id); + } + glDeleteShader(vtx_id); + return GL_ZERO; } - + glShaderSource(frg_id, 1, &frg, 0); glCompileShader(frg_id); - glGetShaderiv(frg_id,GL_COMPILE_STATUS,&ok); + glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok); if(!ok) { - std::cerr << "frg compilation failed\n"; + if (print_errors) + { + std::cerr << "frg compilation failed.\n"; + std::cerr << "Source:\n" << frg << 
"\n"; + printShaderInfoLog(frg_id); + } + glDeleteShader(vtx_id); + glDeleteShader(frg_id); + return GL_ZERO; } - + glAttachShader(prg_id, vtx_id); glAttachShader(prg_id, frg_id); glLinkProgram(prg_id); - glGetProgramiv(prg_id,GL_LINK_STATUS,&ok); + + // Delete shaders once linked. + glDeleteShader(vtx_id); + glDeleteShader(frg_id); + glGetProgramiv(prg_id, GL_LINK_STATUS, &ok); if(!ok) { - std::cerr << "linking failed\n"; + if (print_errors) + { + std::cerr << "linking failed.\n"; + printProgramInfoLog(prg_id); + } + glDeleteProgram(prg_id); + return GL_ZERO; } - printInfoLog(prg_id); - + glUseProgram(prg_id); return prg_id; } -void test_openglsupport() +GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true) { - int argc = 0; - glutInit(&argc, 0); - glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH); - glutInitWindowPosition (0,0); - glutInitWindowSize(10, 10); + return createProgram(vtx.c_str(), frg.c_str(), print_errors); +} - if(glutCreateWindow("Eigen") <= 0) +std::string getGlslVersionString(int gl_major_version, int gl_minor_version) +{ + switch (gl_major_version) { - std::cerr << "Error: Unable to create GLUT Window.\n"; - exit(1); + case 2: + switch (gl_minor_version) + { + case 0: + return "#version 110"; + case 1: + return "#version 120"; + } + break; + case 3: + switch (gl_minor_version) + { + case 0: + return "#version 130"; + case 1: + return "#version 140"; + case 2: + return "#version 150"; + case 3: + return "#version 330"; + } + break; + case 4: + switch (gl_minor_version) + { + case 0: + return "#version 400"; + case 1: + return "#version 410"; + case 2: + return "#version 420"; + case 3: + return "#version 430"; + case 4: + return "#version 440"; + case 5: + return "#version 450"; + case 6: + return "#version 460"; + } + break; } - - glewExperimental = GL_TRUE; - if(glewInit() != GLEW_OK) - { - std::cerr << "Warning: Failed to initialize GLEW\n"; + return ""; +} + +void find_and_replace( + std::string& str, + const std::string& find, + const std::string& replace) +{ + size_t loc = 0; + size_t flen = find.length(); + size_t rlen = replace.length(); + while ( (loc = str.find(find, loc)) != std::string::npos) { + str.replace(loc, flen, replace); + loc += rlen; } +} - Vector3f v3f; - Matrix3f rot; - glBegin(GL_POINTS); - - glVertex(v3f); - glVertex(2*v3f+v3f); - glVertex(rot*v3f); - - glEnd(); - - // 4x4 matrices - Matrix4f mf44; mf44.setRandom(); - VERIFY_MATRIX(glLoadMatrix(mf44), mf44); - VERIFY_MATRIX(glMultMatrix(mf44), mf44); - Matrix4d md44; md44.setRandom(); - VERIFY_MATRIX(glLoadMatrix(md44), md44); - VERIFY_MATRIX(glMultMatrix(md44), md44); - - // Quaternion - Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random())); - VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix()); - - Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random())); - VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix()); - - // 3D Transform - Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix()); - VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix()); - - Transform<float,3,Affine> af3(acf3); - VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix()); - VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix()); - - Transform<float,3,Projective> pf3; pf3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix()); - VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix()); - - 
Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix()); - VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix()); - - Transform<double,3,Affine> ad3(acd3); - VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix()); - VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix()); - - Transform<double,3,Projective> pd3; pd3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix()); - VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix()); - - // translations (2D and 3D) - { - Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0; - VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix()); - Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0; - VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix()); - - Vector3f vf3; vf3.setRandom(); - VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix()); - Vector3d vd3; vd3.setRandom(); - VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix()); - - Translation<float,3> tf3; tf3.vector().setRandom(); - VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix()); - - Translation<double,3> td3; td3.vector().setRandom(); - VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix()); +// Finds and replaces a set of substrings in a string. +std::string format( + const std::string& str, + const std::vector<std::string>& find, + const std::vector<std::string>& replace) +{ + std::string out = str; + for (std::size_t i=0; i<find.size(); ++i) { + find_and_replace(out, find[i], replace[i]); } - - // scaling (2D and 3D) + return out; +} + +// GLUT display function that runs test. Must be run within the display loop +// in order to properly destroy resources. +void openglsupport_test_loop() +{ + // Get context info. + const GLubyte* gl_version_string = glGetString(GL_VERSION); + std::cerr << "GL version: " << gl_version_string << std::endl; + std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl; + // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+. + // Version string guaranteed to be <major>.<minor><vender extension>. + GLint gl_major_version = gl_version_string[0] - '0'; + GLint gl_minor_version = gl_version_string[2] - '0'; + bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2); + + // Fixed-function pipeline removed in OpenGL 3.2. + if (legacy_gl) { - Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1; - VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix()); - Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1; - VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix()); - - Vector3f vf3; vf3.setRandom(); - VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix()); - Vector3d vd3; vd3.setRandom(); - VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix()); - - UniformScaling<float> usf(internal::random<float>()); - VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix()); - - UniformScaling<double> usd(internal::random<double>()); - VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix()); + // Draw a basic triangle. 
+ Vector3f v3f; + Matrix3f rot; + glBegin(GL_POINTS); + { + glVertex(v3f); + glVertex(2*v3f+v3f); + glVertex(rot*v3f); + } + glEnd(); + + // 4x4 matrices + Matrix4f mf44; mf44.setRandom(); + VERIFY_MATRIX(glLoadMatrix(mf44), mf44); + VERIFY_MATRIX(glMultMatrix(mf44), mf44); + Matrix4d md44; md44.setRandom(); + VERIFY_MATRIX(glLoadMatrix(md44), md44); + VERIFY_MATRIX(glMultMatrix(md44), md44); + + // Quaternion + Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random())); + VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix()); + + Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random())); + VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix()); + + // 3D Transform + Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix()); + VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix()); + + Transform<float,3,Affine> af3(acf3); + VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix()); + VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix()); + + Transform<float,3,Projective> pf3; pf3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix()); + VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix()); + + Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix()); + VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix()); + + Transform<double,3,Affine> ad3(acd3); + VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix()); + VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix()); + + Transform<double,3,Projective> pd3; pd3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix()); + VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix()); + + // translations (2D and 3D) + { + Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0; + VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix()); + Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0; + VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix()); + + Vector3f vf3; vf3.setRandom(); + VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix()); + Vector3d vd3; vd3.setRandom(); + VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix()); + + Translation<float,3> tf3; tf3.vector().setRandom(); + VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix()); + + Translation<double,3> td3; td3.vector().setRandom(); + VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix()); + } + + // scaling (2D and 3D) + { + Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1; + VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix()); + Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1; + VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix()); + + Vector3f vf3; vf3.setRandom(); + VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix()); + Vector3d vd3; vd3.setRandom(); + VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix()); + + UniformScaling<float> usf(internal::random<float>()); + VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix()); + + UniformScaling<double> usd(internal::random<double>()); + VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix()); + } + } else { + std::cerr << "Warning: fixed-function pipeline was not tested.\n"; + } + + // Dynamic shader substitution variables. 
+ // Modern shaders require a version string, and newer runtimes fail to + // compile old GLSL versions. Thus, we dynamically set the GLSL version + // string based on the GL version found at runtime. Also, pre-OpenGL 3.0, the + // output gl_FragColor was built-in. This was deprecated in OpenGL 3.0, + // requiring us to explicitly define the output variable. + std::vector<std::string> glsl_vars; + glsl_vars.push_back("${GLSL_VERSION}"); + glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}"); + glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}"); + + std::vector<std::string> glsl_vals; + glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version)); + if (gl_major_version >= 3) { + glsl_vals.push_back("out vec4 fragColor;"); + glsl_vals.push_back("fragColor"); + } else { + glsl_vals.push_back(""); + glsl_vals.push_back("gl_FragColor"); } - + // uniform { - const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n"; - - if(GLEW_VERSION_2_0) + // vertex shader. + std::string vtx = format( + "${GLSL_VERSION}\n" + "void main(void) {\n" + " gl_Position = vec4(0,0,0,1);\n" + "}\n", + glsl_vars, glsl_vals); + +#ifdef GL_VERSION_2_0 + if(GLEW_VERSION_2_0 && GL_VERSION_2_0) { - #ifdef GL_VERSION_2_0 - const char* frg = "" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform vec2 v2f;\n" "uniform vec3 v3f;\n" "uniform vec4 v4f;\n" @@ -231,107 +389,212 @@ void test_openglsupport() "uniform mat2 m2f;\n" "uniform mat3 m3f;\n" "uniform mat4 m4f;\n" - "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - - VERIFY_UNIFORM(fv,v2f, Vector2f); - VERIFY_UNIFORM(fv,v3f, Vector3f); - VERIFY_UNIFORM(fv,v4f, Vector4f); + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) { \n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); + VERIFY_UNIFORM(fv, v2f, Vector2f); + VERIFY_UNIFORM(fv, v3f, Vector3f); + VERIFY_UNIFORM(fv, v4f, Vector4f); VERIFY_UNIFORMi(v2i, Vector2i); VERIFY_UNIFORMi(v3i, Vector3i); VERIFY_UNIFORMi(v4i, Vector4i); - VERIFY_UNIFORM(fv,m2f, Matrix2f); - VERIFY_UNIFORM(fv,m3f, Matrix3f); - VERIFY_UNIFORM(fv,m4f, Matrix4f); - #endif + VERIFY_UNIFORM(fv, m2f, Matrix2f); + VERIFY_UNIFORM(fv, m3f, Matrix3f); + VERIFY_UNIFORM(fv, m4f, Matrix4f); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 2.0 was not tested\n"; - - if(GLEW_VERSION_2_1) +#endif + std::cerr << "Warning: opengl 2.0 was not tested.\n"; + +#ifdef GL_VERSION_2_1 + if(GLEW_VERSION_2_1 && GL_VERSION_2_1 && + (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1))) { - #ifdef GL_VERSION_2_1 - const char* frg = "#version 120\n" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform mat2x3 m23f;\n" "uniform mat3x2 m32f;\n" "uniform mat2x4 m24f;\n" "uniform mat4x2 m42f;\n" "uniform mat3x4 m34f;\n" "uniform mat4x3 m43f;\n" - "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); typedef Matrix<float,2,3>
Matrix23f; typedef Matrix<float,3,2> Matrix32f; typedef Matrix<float,2,4> Matrix24f; typedef Matrix<float,4,2> Matrix42f; typedef Matrix<float,3,4> Matrix34f; typedef Matrix<float,4,3> Matrix43f; - - VERIFY_UNIFORM(fv,m23f, Matrix23f); - VERIFY_UNIFORM(fv,m32f, Matrix32f); - VERIFY_UNIFORM(fv,m24f, Matrix24f); - VERIFY_UNIFORM(fv,m42f, Matrix42f); - VERIFY_UNIFORM(fv,m34f, Matrix34f); - VERIFY_UNIFORM(fv,m43f, Matrix43f); - #endif + + VERIFY_UNIFORM(fv, m23f, Matrix23f); + VERIFY_UNIFORM(fv, m32f, Matrix32f); + VERIFY_UNIFORM(fv, m24f, Matrix24f); + VERIFY_UNIFORM(fv, m42f, Matrix42f); + VERIFY_UNIFORM(fv, m34f, Matrix34f); + VERIFY_UNIFORM(fv, m43f, Matrix43f); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 2.1 was not tested\n"; - - if(GLEW_VERSION_3_0) +#endif + std::cerr << "Warning: opengl 2.1 was not tested.\n"; + +#ifdef GL_VERSION_3_0 + if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3) { - #ifdef GL_VERSION_3_0 - const char* frg = "#version 150\n" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform uvec2 v2ui;\n" "uniform uvec3 v3ui;\n" "uniform uvec4 v4ui;\n" - "out vec4 data;\n" - "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); typedef Matrix<unsigned int,2,1> Vector2ui; typedef Matrix<unsigned int,3,1> Vector3ui; typedef Matrix<unsigned int,4,1> Vector4ui; - + VERIFY_UNIFORMi(v2ui, Vector2ui); VERIFY_UNIFORMi(v3ui, Vector3ui); VERIFY_UNIFORMi(v4ui, Vector4ui); - #endif + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 3.0 was not tested\n"; - - #ifdef GLEW_ARB_gpu_shader_fp64 +#endif + std::cerr << "Warning: opengl 3.0 was not tested.\n"; + + // dvecn supported if >= 4.1 or ARB_vertex_attrib_64bit + bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1); + bool has_fp64_extension = false; +#ifdef GLEW_ARB_gpu_shader_fp64 if(GLEW_ARB_gpu_shader_fp64) { - #ifdef GL_ARB_gpu_shader_fp64 - const char* frg = "#version 150\n" + // Check that extension can actually be compiled. 
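+ // (Presumably some drivers advertise the extension yet still reject dvec in + // GLSL, so a one-uniform probe shader is compiled first, with print_errors + // disabled, and the extension is trusted only if that compile succeeds.)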
+ if (!has_fp64_extension) + { + std::string frg = format( + "${GLSL_VERSION}\n" + "#extension GL_ARB_gpu_shader_fp64 : enable\n" + "uniform dvec2 dv2;\n" + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n" + "}\n", + glsl_vars, glsl_vals); + GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false); + if (prg_id) + { + has_fp64_extension = true; + glDeleteProgram(prg_id); + } + } + } +#endif + + if( has_fp64_native || has_fp64_extension ) + { + std::vector<std::string> glsl_vars_with_extension = glsl_vars; + glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}"); + std::vector<std::string> glsl_vals_with_extension = glsl_vals; + if (has_fp64_extension) + { + glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable"); + } + else + { + glsl_vals_with_extension.push_back(""); + } + + std::string frg = format( + "${GLSL_VERSION}\n" + "${GLSL_EXTENSIONS}\n" "uniform dvec2 v2d;\n" "uniform dvec3 v3d;\n" "uniform dvec4 v4d;\n" - "out vec4 data;\n" - "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - - typedef Vector2d Vector2d; - typedef Vector3d Vector3d; - typedef Vector4d Vector4d; - - VERIFY_UNIFORM(dv,v2d, Vector2d); - VERIFY_UNIFORM(dv,v3d, Vector3d); - VERIFY_UNIFORM(dv,v4d, Vector4d); - #endif + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n" + "}\n", + glsl_vars_with_extension, glsl_vals_with_extension); + + GLint prg_id = createProgram(vtx,frg); + VERIFY(prg_id > 0 && "Failed to create program."); + VERIFY_UNIFORM(dv, v2d, Vector2d); + VERIFY_UNIFORM(dv, v3d, Vector3d); + VERIFY_UNIFORM(dv, v4d, Vector4d); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n"; - #else - std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n"; - #endif + std::cerr << "Warning: dvec (fp64) was not tested.\n"; } - + + // Exit the loop. Leaving the main loop is supported by freeglut; otherwise + // we are forced to exit. +#ifdef FREEGLUT + glutLeaveMainLoop(); + // Trigger another display loop iteration. Otherwise, it just hangs. + glutPostRedisplay(); +#else + exit(0); +#endif +} + +EIGEN_DECLARE_TEST(openglsupport) +{ + int argc = 0; + glutInit(&argc, 0); + + GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH; + +#ifndef EIGEN_LEGACY_OPENGL + // Initialize 3.2+ OpenGL context. +#if defined(__APPLE_CC__) + glut_display_mode |= GLUT_3_2_CORE_PROFILE; +#elif defined(FREEGLUT) + glutInitContextVersion(3, 2); + glutInitContextFlags(GLUT_FORWARD_COMPATIBLE); + glutInitContextProfile(GLUT_CORE_PROFILE); +#endif +#endif + + glutInitDisplayMode(glut_display_mode); + glutInitWindowPosition(0, 0); + glutInitWindowSize(10, 10); + + int window = glutCreateWindow("Eigen"); + if(window <= 0) + { + std::cerr << "Error: Unable to create GLUT Window.\n"; + exit(1); + } + + glewExperimental = GL_TRUE; + if(glewInit() != GLEW_OK) + { + std::cerr << "Warning: Failed to initialize GLEW.\n"; + exit(1); + } + + // Run test in display, otherwise GLUT fails to clean up and leads to memory + // access errors on exit.
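+ // (glutMainLoop normally never returns, so the test finishes inside the + // display callback instead: via glutLeaveMainLoop under freeglut, or via + // exit(0) elsewhere, as coded above.)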
+ glutDisplayFunc(openglsupport_test_loop); + glutMainLoop(); + glutDestroyWindow(window); } diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index 0c87478dd..4ff9bda5a 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -26,15 +26,25 @@ struct increment_if_fixed_size } } +template<typename PolynomialType> +PolynomialType polyder(const PolynomialType& p) +{ + typedef typename PolynomialType::Scalar Scalar; + PolynomialType res(p.size()); + for(Index i=1; i<p.size(); ++i) + res[i-1] = p[i]*Scalar(i); + res[p.size()-1] = 0.; + return res; +} template<int Deg, typename POLYNOMIAL, typename SOLVER> bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) { - typedef typename POLYNOMIAL::Index Index; typedef typename POLYNOMIAL::Scalar Scalar; + typedef typename POLYNOMIAL::RealScalar RealScalar; typedef typename SOLVER::RootsType RootsType; - typedef Matrix<Scalar,Deg,1> EvalRootsType; + typedef Matrix<RealScalar,Deg,1> EvalRootsType; const Index deg = pols.size()-1; @@ -44,10 +54,17 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) psolve.compute( pols ); const RootsType& roots( psolve.roots() ); EvalRootsType evr( deg ); + POLYNOMIAL pols_der = polyder(pols); + EvalRootsType der( deg ); for( int i=0; i<roots.size(); ++i ){ - evr[i] = std::abs( poly_eval( pols, roots[i] ) ); } + evr[i] = std::abs( poly_eval( pols, roots[i] ) ); + der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) )); + } - bool evalToZero = evr.isZero( test_precision<Scalar>() ); + // We need to divide by the magnitude of the derivative because, when the + // derivative is large, a very small error in the value of the root yields a + // very large error in the polynomial evaluation.
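+ // (First-order sketch: if the computed root r is off by e, then + // poly_eval(pols, r) ~= e * poly_eval(pols_der, r), so dividing the residual + // by max(1, |p'(r)|) turns it back into an estimate of the root error e.)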
+ bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() ); if( !evalToZero ) { cerr << "WRONG root: " << endl; @@ -57,7 +74,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) cerr << endl; } - std::vector<Scalar> rootModuli( roots.size() ); + std::vector<RealScalar> rootModuli( roots.size() ); Map< EvalRootsType > aux( &rootModuli[0], roots.size() ); aux = roots.array().abs(); std::sort( rootModuli.begin(), rootModuli.end() ); @@ -83,7 +100,7 @@ void evalSolver( const POLYNOMIAL& pols ) { typedef typename POLYNOMIAL::Scalar Scalar; - typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; + typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; PolynomialSolverType psolve; aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve ); @@ -97,6 +114,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const { using std::sqrt; typedef typename POLYNOMIAL::Scalar Scalar; + typedef typename POLYNOMIAL::RealScalar RealScalar; typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; @@ -107,15 +125,12 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const // 1) the roots found are correct // 2) the roots have distinct moduli - typedef typename POLYNOMIAL::Scalar Scalar; - typedef typename REAL_ROOTS::Scalar Real; - //Test realRoots - std::vector< Real > calc_realRoots; - psolve.realRoots( calc_realRoots ); - VERIFY( calc_realRoots.size() == (size_t)real_roots.size() ); + std::vector< RealScalar > calc_realRoots; + psolve.realRoots( calc_realRoots, test_precision<RealScalar>()); + VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() ); - const Scalar psPrec = sqrt( test_precision<Scalar>() ); + const RealScalar psPrec = sqrt( test_precision<RealScalar>() ); for( size_t i=0; i<calc_realRoots.size(); ++i ) { @@ -138,7 +153,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const bool hasRealRoot; //Test absGreatestRealRoot - Real r = psolve.absGreatestRealRoot( hasRealRoot ); + RealScalar r = psolve.absGreatestRealRoot( hasRealRoot ); VERIFY( hasRealRoot == (real_roots.size() > 0 ) ); if( hasRealRoot ){ VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) ); } @@ -167,9 +182,11 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const template<typename _Scalar, int _Deg> void polynomialsolver(int deg) { - typedef internal::increment_if_fixed_size<_Deg> Dim; + typedef typename NumTraits<_Scalar>::Real RealScalar; + typedef internal::increment_if_fixed_size<_Deg> Dim; typedef Matrix<_Scalar,Dim::ret,1> PolynomialType; typedef Matrix<_Scalar,_Deg,1> EvalRootsType; + typedef Matrix<RealScalar,_Deg,1> RealRootsType; cout << "Standard cases" << endl; PolynomialType pols = PolynomialType::Random(deg+1); @@ -182,19 +199,15 @@ void polynomialsolver(int deg) evalSolver<_Deg,PolynomialType>( pols ); cout << "Test sugar" << endl; - EvalRootsType realRoots = EvalRootsType::Random(deg); + RealRootsType realRoots = RealRootsType::Random(deg); roots_to_monicPolynomial( realRoots, pols ); evalSolverSugarFunction<_Deg>( pols, - realRoots.template cast < - std::complex< - typename NumTraits<_Scalar>::Real - > - >(), + realRoots.template cast <std::complex<RealScalar> >().eval(), realRoots ); } -void test_polynomialsolver() +EIGEN_DECLARE_TEST(polynomialsolver) { for(int i = 0; i < g_repeat; i++) { @@ -214,5 +227,6 @@ void test_polynomialsolver() internal::random<int>(9,13) )) ); 
CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) ); + CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) ); } } diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp index 5fc968402..8ff451996 100644 --- a/unsupported/test/polynomialutils.cpp +++ b/unsupported/test/polynomialutils.cpp @@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar() internal::random<int>(18,26) )) ); } -void test_polynomialutils() +EIGEN_DECLARE_TEST(polynomialutils) { for(int i = 0; i < g_repeat; i++) { diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index a010ceb93..602c2cb84 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -8,10 +8,45 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// import basic and product tests for deprectaed DynamicSparseMatrix +// import basic and product tests for deprecated DynamicSparseMatrix +#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled +static long g_realloc_count = 0; +#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++; + +static long g_dense_op_sparse_count = 0; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20; + +#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1 +#endif + #define EIGEN_NO_DEPRECATED_WARNING -#include "sparse_basic.cpp" +// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix) +// has an extra copy-assignment. +#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT #include "sparse_product.cpp" + +#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled +#include "sparse_basic.cpp" +#endif + +#if EIGEN_HAS_CXX11 + +#ifdef min +#undef min +#endif + +#ifdef max +#undef max +#endif + +#include <unordered_map> +#define EIGEN_UNORDERED_MAP_SUPPORT + +#endif + + #include <Eigen/SparseExtra> template<typename SetterType,typename DenseType, typename Scalar, int Options> @@ -104,10 +139,8 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re #ifdef EIGEN_UNORDERED_MAP_SUPPORT VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) )); #endif - #ifdef _DENSE_HASH_MAP_H_ + #ifdef EIGEN_GOOGLEHASH_SUPPORT VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) )); - #endif - #ifdef _SPARSE_HASH_MAP_H_ VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) )); #endif @@ -129,7 +162,32 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re } -void test_sparse_extra() + +template<typename SparseMatrixType> +void check_marketio() +{ + typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix; + Index rows = internal::random<Index>(1,100); + Index cols = internal::random<Index>(1,100); + SparseMatrixType m1, m2; + m1 = DenseMatrix::Random(rows, cols).sparseView(); + saveMarket(m1, "sparse_extra.mtx"); + loadMarket(m2, "sparse_extra.mtx"); + VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2)); +} + +template<typename VectorType> +void check_marketio_vector() +{ + Index size = internal::random<Index>(1,100); + VectorType v1, v2; + 
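+ // (Round-trips a random vector through a Matrix Market file on disk; the + // exact-equality check below assumes the writer emits enough digits to + // reproduce every scalar bit-for-bit.)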
v1 = VectorType::Random(size); + saveMarketVector(v1, "vector_extra.mtx"); + loadMarketVector(v2, "vector_extra.mtx"); + VERIFY_IS_EQUAL(v1,v2); +} + +EIGEN_DECLARE_TEST(sparse_extra) { for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random<int>(1,50); @@ -143,5 +201,26 @@ void test_sparse_extra() CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) ); CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) ); + + CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) ); + + + CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) ); + + TEST_SET_BUT_UNUSED_VARIABLE(s); } } diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 057fb3e92..589bb76e1 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -7,9 +7,21 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include <limits.h> #include "main.h" #include "../Eigen/SpecialFunctions" +// Hack to allow "implicit" conversions from double to Scalar via comma-initialization. +template<typename Derived> +Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) { + return (dense << static_cast<typename Derived::Scalar>(v)); +} + +template<typename XprType> +Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) { + return (ci, static_cast<typename XprType::Scalar>(v)); +} + template<typename X, typename Y> void verify_component_wise(const X& x, const Y& y) { @@ -64,8 +76,8 @@ template<typename ArrayType> void array_special_functions() // igamma(a, x) = gamma(a, x) / Gamma(a) // where Gamma and gamma are considered the standard unnormalized // upper and lower incomplete gamma functions, respectively. 
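+ // In the same notation, igammac(a, x) = Gamma(a, x) / Gamma(a), and for + // a > 0, x >= 0 the two satisfy gamma(a, x) + Gamma(a, x) == Gamma(a), + // which is the identity the unnormalized sum check below relies on.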
- ArrayType a = m1.abs() + 2; - ArrayType x = m2.abs() + 2; + ArrayType a = m1.abs() + Scalar(2); + ArrayType x = m2.abs() + Scalar(2); ArrayType zero = ArrayType::Zero(rows, cols); ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); ArrayType a_m1 = a - one; @@ -74,6 +86,7 @@ template<typename ArrayType> void array_special_functions() ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + // Gamma(a, 0) == Gamma(a) VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); @@ -81,10 +94,23 @@ template<typename ArrayType> void array_special_functions() VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) - VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp()); // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) - VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp()); + } + { + // Verify for large a and x that values are between 0 and 1. + ArrayType m1 = ArrayType::Random(rows,cols); + ArrayType m2 = ArrayType::Random(rows,cols); + int max_exponent = std::numeric_limits<Scalar>::max_exponent10; + ArrayType a = m1.abs() * Scalar(pow(10., max_exponent - 1)); + ArrayType x = m2.abs() * Scalar(pow(10., max_exponent - 1)); + for (int i = 0; i < a.size(); ++i) { + Scalar igam = numext::igamma(a(i), x(i)); + VERIFY(0 <= igam); + VERIFY(igam <= 1); + } } { @@ -93,27 +119,37 @@ template<typename ArrayType> void array_special_functions() Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; // location i*6+j corresponds to a_s[i], x_s[j]. 
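+ // (The hard-coded reference values below were precomputed externally; by the + // pattern used elsewhere in this file they presumably come from SciPy's + // regularized incomplete gamma routines, gammainc and gammaincc.)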
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, - {0.0, 0.6321205588285578, 0.7768698398515702, - 0.9816843611112658, 9.999500016666262e-05, 1.0}, - {0.0, 0.4275932955291202, 0.608374823728911, - 0.9539882943107686, 7.522076445089201e-07, 1.0}, - {0.0, 0.01898815687615381, 0.06564245437845008, - 0.5665298796332909, 4.166333347221828e-18, 1.0}, - {0.0, 0.9999780593618628, 0.9999899967080838, - 0.9999996219837988, 0.9991370418689945, 1.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; - Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, - {1.0, 0.36787944117144233, 0.22313016014842982, - 0.018315638888734182, 0.9999000049998333, 0.0}, - {1.0, 0.5724067044708798, 0.3916251762710878, - 0.04601170568923136, 0.9999992477923555, 0.0}, - {1.0, 0.9810118431238462, 0.9343575456215499, - 0.4334701203667089, 1.0, 0.0}, - {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, - 3.7801620118431334e-07, 0.0008629581310054535, - 0.0}, - {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + Scalar igamma_s[][6] = { + {Scalar(0.0), nan, nan, nan, nan, nan}, + {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702), + Scalar(0.9816843611112658), Scalar(9.999500016666262e-05), + Scalar(1.0)}, + {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911), + Scalar(0.9539882943107686), Scalar(7.522076445089201e-07), + Scalar(1.0)}, + {Scalar(0.0), Scalar(0.01898815687615381), + Scalar(0.06564245437845008), Scalar(0.5665298796332909), + Scalar(4.166333347221828e-18), Scalar(1.0)}, + {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838), + Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)}, + {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), + Scalar(0.5042041932513908)}}; + Scalar igammac_s[][6] = { + {nan, nan, nan, nan, nan, nan}, + {Scalar(1.0), Scalar(0.36787944117144233), + Scalar(0.22313016014842982), Scalar(0.018315638888734182), + Scalar(0.9999000049998333), Scalar(0.0)}, + {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878), + Scalar(0.04601170568923136), Scalar(0.9999992477923555), + Scalar(0.0)}, + {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499), + Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)}, + {Scalar(1.0), Scalar(2.1940638138146658e-05), + Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07), + Scalar(0.0008629581310054535), Scalar(0.0)}, + {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), + Scalar(0.49579580674813944)}}; + for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igamma_s[i][j])) { @@ -133,12 +169,32 @@ template<typename ArrayType> void array_special_functions() } #endif // EIGEN_HAS_C99_MATH + // Check the ndtri function against scipy.special.ndtri + { + ArrayType x(7), res(7), ref(7); + x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01; + ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); ); + + // ndtri(normal_cdf(x)) ~= x + CALL_SUBTEST( + ArrayType m1 = ArrayType::Random(32); + using std::sqrt; + + ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf(); + cdf_val = (cdf_val + Scalar(1)) / Scalar(2); + verify_component_wise(cdf_val.ndtri(), m1);); + + } + // Check the zeta function against scipy.special.zeta { - ArrayType x(7), q(7), res(7), ref(7); - x << 
1.5, 4, 10.5, 10000.5, 3, 1, 0.9; - q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345; - ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan; + ArrayType x(10), q(10), res(10), ref(10); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); @@ -146,22 +202,21 @@ template<typename ArrayType> void array_special_functions() // digamma { - ArrayType x(7), res(7), ref(7); - x << 1, 1.5, 4, -10.5, 10000.5, 0, -1; - ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf; + ArrayType x(9), res(9), ref(9); + x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3; + ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); ); CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); ); } - #if EIGEN_HAS_C99_MATH { - ArrayType n(11), x(11), res(11), ref(11); - n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170; - x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64; - ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927; + ArrayType n(16), x(16), res(16), ref(16); + n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170, -1, 0, 1, 2, 3; + x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64, -1, -2, -3, -4, -5; + ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf; CALL_SUBTEST( verify_component_wise(ref, ref); ); if(sizeof(RealScalar)>=8) { // double @@ -288,8 +343,8 @@ template<typename ArrayType> void array_special_functions() ArrayType m3 = ArrayType::Random(32); ArrayType one = ArrayType::Constant(32, Scalar(1.0)); const Scalar eps = std::numeric_limits<Scalar>::epsilon(); - ArrayType a = (m1 * 4.0).exp(); - ArrayType b = (m2 * 4.0).exp(); + ArrayType a = (m1 * Scalar(4)).exp(); + ArrayType b = (m2 * Scalar(4)).exp(); ArrayType x = m3.abs(); // betainc(a, 1, x) == x**a @@ -335,11 +390,108 @@ template<typename ArrayType> void array_special_functions() ArrayType test = betainc(a, b + one, x) + eps; verify_component_wise(test, expected);); } -#endif +#endif // EIGEN_HAS_C99_MATH + + /* Code to generate the data for the following two test cases. 
+ N = 5 + np.random.seed(3) + + a = np.logspace(-2, 3, 6) + a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N])) + x = np.random.gamma(a, 1.0) + x = np.maximum(x, np.finfo(np.float32).tiny) + + def igamma(a, x): + return mpmath.gammainc(a, 0, x, regularized=True) + + def igamma_der_a(a, x): + res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a) + return np.float64(res) + + def gamma_sample_der_alpha(a, x): + igamma_x = igamma(a, x) + def igammainv_of_igamma(a_prime): + return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) - + igamma_x, x, solver='newton') + return np.float64(mpmath.diff(igammainv_of_igamma, a)) + + v_igamma_der_a = np.vectorize(igamma_der_a)(a, x) + v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x) + */ + +#if EIGEN_HAS_C99_MATH + // Test igamma_der_a + { + ArrayType a(30); + ArrayType x(30); + ArrayType res(30); + ArrayType v(30); + + a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, + 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, + 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, + 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426, + 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354, + 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371, + 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754, + 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514, + -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596, + -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323, + -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522, + -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294, + -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921, + -0.00794899153653, -0.00777303219211, -0.00796085782042, + -0.0125850719397, -0.00455500206958, -0.00476436993148; + + CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v);); + } + + // Test gamma_sample_der_alpha + { + ArrayType alpha(30); + ArrayType sample(30); + ArrayType res(30); + ArrayType v(30); + + alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, + 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, + 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, + 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426, + 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354, + 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371, + 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754, + 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, + 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, + 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, + 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, + 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, + 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, + 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, + 1.00106492525, 
0.97734200649, 1.02198794179; + + CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample); + verify_component_wise(res, v);); + } +#endif // EIGEN_HAS_C99_MATH } -void test_special_functions() +EIGEN_DECLARE_TEST(special_functions) { CALL_SUBTEST_1(array_special_functions<ArrayXf>()); CALL_SUBTEST_2(array_special_functions<ArrayXd>()); + // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above. + // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>()); + // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>()); } diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp new file mode 100644 index 000000000..31233f1b0 --- /dev/null +++ b/unsupported/test/special_packetmath.cpp @@ -0,0 +1,149 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include <limits> +#include "packetmath_test_shared.h" +#include "../Eigen/SpecialFunctions" + +template<typename Scalar,typename Packet> void packetmath_real() +{ + using std::abs; + typedef internal::packet_traits<Scalar> PacketTraits; + const int PacketSize = internal::unpacket_traits<Packet>::size; + + const int size = PacketSize*4; + EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar ref[PacketSize*4]; + +#if EIGEN_HAS_C99_MATH + { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h; + h.store(data2, internal::plgamma(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + if (internal::packet_traits<Scalar>::HasErf) { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h; + h.store(data2, internal::perf(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h; + h.store(data2, internal::perfc(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1)); + } + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri); + } +#endif // EIGEN_HAS_C99_MATH + + // For bessel_i*e and bessel_j*, the valid range includes negative reals.
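+ // (The exponentially scaled variants satisfy bessel_i0e(x) = exp(-|x|) * + // bessel_i0(x) and bessel_i1e(x) = exp(-|x|) * bessel_i1(x), which keeps them + // bounded even for the large |x| values sampled here.)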
+ { + const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6); + for (int i=0; i<size; ++i) + { + data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent)))); + data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent)))); + } + + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1); + } + + // Use a smaller data range for the bessel_i* as these can become very large. + // Following #1693, we also restrict this range further to avoid infs due to + // differences in pexp and exp. + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1); + + + // y_i and k_i are valid only for x > 0. + { + const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5); + for (int i=0; i<size; ++i) + { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent)))); + } + } + + // TODO(srvasude): Re-enable this test once we have properly investigated why + // the scalar and vector paths differ. + // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e); + + // Following #1693, we restrict the range for exp to avoid zeroing out too + // fast.
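+ // (Since bessel_k0(x) = exp(-x) * bessel_k0e(x) decays exponentially, a wide + // range would let the reference exp and the vectorized pexp round to zero at + // slightly different points, per the #1693 note above.)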
+ for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1); + + + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + +#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11) + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma); + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf); + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc); +#endif + +} + +namespace Eigen { +namespace test { + +template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger> +struct runall { + static void run() { + packetmath_real<Scalar,PacketType>(); + } +}; + +} +} + +EIGEN_DECLARE_TEST(special_packetmath) +{ + g_first_pass = true; + for(int i = 0; i < g_repeat; i++) { + + CALL_SUBTEST_1( test::runner<float>::run() ); + CALL_SUBTEST_2( test::runner<double>::run() ); + CALL_SUBTEST_3( test::runner<Eigen::half>::run() ); + CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() ); + g_first_pass = false; + } +} diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 3be020434..88ec87b97 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d() } } -void test_splines() +EIGEN_DECLARE_TEST(splines) { for (int i = 0; i < g_repeat; ++i) {