author     Yi Kong <yikong@google.com>  2022-02-25 17:02:53 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2022-02-25 17:02:53 +0000
commit     edb0ad5bb04b48aab7dd0978f0475edd3550de7c (patch)
tree       fb979fb4cf4f8052c8cc66b1ec9516d91fcd859b /unsupported/test
parent     8fd413e275f78a4c240f1442ce5cf77c73a20a55 (diff)
parent     bc0f5df265caa21a2120c22453655a7fcc941991 (diff)
download   eigen-edb0ad5bb04b48aab7dd0978f0475edd3550de7c.tar.gz
Original change: https://android-review.googlesource.com/c/platform/external/eigen/+/1999079
Change-Id: Ife39d10c8b23d3eeb174cd52f462f9d20527ad03
Diffstat (limited to 'unsupported/test')
-rw-r--r--  unsupported/test/BVH.cpp  2
-rw-r--r--  unsupported/test/CMakeLists.txt  378
-rw-r--r--  unsupported/test/EulerAngles.cpp  308
-rw-r--r--  unsupported/test/FFTW.cpp  2
-rw-r--r--  unsupported/test/NonLinearOptimization.cpp  123
-rw-r--r--  unsupported/test/NumericalDiff.cpp  4
-rw-r--r--  unsupported/test/alignedvector3.cpp  5
-rw-r--r--  unsupported/test/autodiff.cpp  42
-rw-r--r--  unsupported/test/autodiff_scalar.cpp  5
-rw-r--r--  unsupported/test/bessel_functions.cpp  370
-rw-r--r--  unsupported/test/cxx11_eventcount.cpp  12
-rw-r--r--  unsupported/test/cxx11_maxsizevector.cpp  77
-rw-r--r--  unsupported/test/cxx11_meta.cpp  2
-rw-r--r--  unsupported/test/cxx11_non_blocking_thread_pool.cpp  83
-rw-r--r--  unsupported/test/cxx11_runqueue.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_argmax.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_gpu.cu (renamed from unsupported/test/cxx11_tensor_argmax_cuda.cu)  93
-rw-r--r--  unsupported/test/cxx11_tensor_argmax_sycl.cpp  258
-rw-r--r--  unsupported/test/cxx11_tensor_assign.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_block_access.cpp  576
-rw-r--r--  unsupported/test/cxx11_tensor_block_eval.cpp  858
-rw-r--r--  unsupported/test/cxx11_tensor_block_io.cpp  445
-rw-r--r--  unsupported/test/cxx11_tensor_broadcast_sycl.cpp  118
-rw-r--r--  unsupported/test/cxx11_tensor_broadcasting.cpp  141
-rw-r--r--  unsupported/test/cxx11_tensor_builtins_sycl.cpp  354
-rw-r--r--  unsupported/test/cxx11_tensor_cast_float16_gpu.cu (renamed from unsupported/test/cxx11_tensor_cast_float16_cuda.cu)  13
-rw-r--r--  unsupported/test/cxx11_tensor_casts.cpp  83
-rw-r--r--  unsupported/test/cxx11_tensor_chipping.cpp  10
-rw-r--r--  unsupported/test/cxx11_tensor_chipping_sycl.cpp  623
-rw-r--r--  unsupported/test/cxx11_tensor_comparisons.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu (renamed from unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu)  21
-rw-r--r--  unsupported/test/cxx11_tensor_complex_gpu.cu (renamed from unsupported/test/cxx11_tensor_complex_cuda.cu)  49
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation.cpp  10
-rw-r--r--  unsupported/test/cxx11_tensor_concatenation_sycl.cpp  180
-rw-r--r--  unsupported/test/cxx11_tensor_const.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_contract_gpu.cu (renamed from unsupported/test/cxx11_tensor_contract_cuda.cu)  96
-rw-r--r--  unsupported/test/cxx11_tensor_contract_sycl.cpp  1026
-rw-r--r--  unsupported/test/cxx11_tensor_contraction.cpp  120
-rw-r--r--  unsupported/test/cxx11_tensor_convolution.cpp  5
-rw-r--r--  unsupported/test/cxx11_tensor_convolution_sycl.cpp  469
-rw-r--r--  unsupported/test/cxx11_tensor_custom_index.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_custom_op.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_custom_op_sycl.cpp  170
-rw-r--r--  unsupported/test/cxx11_tensor_device.cu  66
-rw-r--r--  unsupported/test/cxx11_tensor_device_sycl.cpp  64
-rw-r--r--  unsupported/test/cxx11_tensor_dimension.cpp  21
-rw-r--r--  unsupported/test/cxx11_tensor_empty.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_executor.cpp  731
-rw-r--r--  unsupported/test/cxx11_tensor_expr.cpp  180
-rw-r--r--  unsupported/test/cxx11_tensor_fft.cpp  33
-rw-r--r--  unsupported/test/cxx11_tensor_fixed_size.cpp  4
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval.cpp  4
-rw-r--r--  unsupported/test/cxx11_tensor_forced_eval_sycl.cpp  63
-rw-r--r--  unsupported/test/cxx11_tensor_generator.cpp  8
-rw-r--r--  unsupported/test/cxx11_tensor_generator_sycl.cpp  147
-rw-r--r--  unsupported/test/cxx11_tensor_gpu.cu (renamed from unsupported/test/cxx11_tensor_cuda.cu)  958
-rw-r--r--  unsupported/test/cxx11_tensor_ifft.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_image_op_sycl.cpp  103
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch.cpp  54
-rw-r--r--  unsupported/test/cxx11_tensor_image_patch_sycl.cpp  1092
-rw-r--r--  unsupported/test/cxx11_tensor_index_list.cpp  35
-rw-r--r--  unsupported/test/cxx11_tensor_inflation.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_inflation_sycl.cpp  136
-rw-r--r--  unsupported/test/cxx11_tensor_intdiv.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_io.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_layout_swap.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_layout_swap_sycl.cpp  126
-rw-r--r--  unsupported/test/cxx11_tensor_lvalue.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_map.cpp  68
-rw-r--r--  unsupported/test/cxx11_tensor_math.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_math_sycl.cpp  105
-rw-r--r--  unsupported/test/cxx11_tensor_mixed_indices.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_morphing.cpp  198
-rw-r--r--  unsupported/test/cxx11_tensor_morphing_sycl.cpp  386
-rw-r--r--  unsupported/test/cxx11_tensor_move.cpp  76
-rw-r--r--  unsupported/test/cxx11_tensor_notification.cpp  39
-rw-r--r--  unsupported/test/cxx11_tensor_of_complex.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_of_const_values.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_of_float16_gpu.cu (renamed from unsupported/test/cxx11_tensor_of_float16_cuda.cu)  144
-rw-r--r--  unsupported/test/cxx11_tensor_of_strings.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_padding.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_padding_sycl.cpp  157
-rw-r--r--  unsupported/test/cxx11_tensor_patch.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_patch_sycl.cpp  249
-rw-r--r--  unsupported/test/cxx11_tensor_random.cpp  20
-rw-r--r--  unsupported/test/cxx11_tensor_random_gpu.cu (renamed from unsupported/test/cxx11_tensor_random_cuda.cu)  34
-rw-r--r--  unsupported/test/cxx11_tensor_random_sycl.cpp  100
-rw-r--r--  unsupported/test/cxx11_tensor_reduction.cpp  88
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_gpu.cu (renamed from unsupported/test/cxx11_tensor_reduction_cuda.cu)  13
-rw-r--r--  unsupported/test/cxx11_tensor_reduction_sycl.cpp  1002
-rw-r--r--  unsupported/test/cxx11_tensor_ref.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_reverse.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_reverse_sycl.cpp  253
-rw-r--r--  unsupported/test/cxx11_tensor_roundings.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_scan.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_scan_gpu.cu (renamed from unsupported/test/cxx11_tensor_scan_cuda.cu)  29
-rw-r--r--  unsupported/test/cxx11_tensor_scan_sycl.cpp  141
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling.cpp  67
-rw-r--r--  unsupported/test/cxx11_tensor_shuffling_sycl.cpp  117
-rw-r--r--  unsupported/test/cxx11_tensor_simple.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_striding.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_striding_sycl.cpp  203
-rw-r--r--  unsupported/test/cxx11_tensor_sugar.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_sycl.cpp  308
-rw-r--r--  unsupported/test/cxx11_tensor_symmetry.cpp  2
-rw-r--r--  unsupported/test/cxx11_tensor_thread_local.cpp  149
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp  390
-rw-r--r--  unsupported/test/cxx11_tensor_trace.cpp  172
-rw-r--r--  unsupported/test/cxx11_tensor_uint128.cpp  4
-rw-r--r--  unsupported/test/cxx11_tensor_volume_patch.cpp  8
-rw-r--r--  unsupported/test/cxx11_tensor_volume_patch_sycl.cpp  222
-rw-r--r--  unsupported/test/dgmres.cpp  4
-rw-r--r--  unsupported/test/forward_adolc.cpp  6
-rw-r--r--  unsupported/test/gmres.cpp  2
-rw-r--r--  unsupported/test/idrs.cpp  27
-rw-r--r--  unsupported/test/kronecker_product.cpp  28
-rw-r--r--  unsupported/test/levenberg_marquardt.cpp  2
-rw-r--r--  unsupported/test/matrix_exponential.cpp  2
-rw-r--r--  unsupported/test/matrix_function.cpp  52
-rw-r--r--  unsupported/test/matrix_power.cpp  42
-rw-r--r--  unsupported/test/matrix_square_root.cpp  2
-rw-r--r--  unsupported/test/minres.cpp  2
-rw-r--r--  unsupported/test/mpreal/mpreal.h  3104
-rw-r--r--  unsupported/test/mpreal_support.cpp  3
-rw-r--r--  unsupported/test/openglsupport.cpp  639
-rw-r--r--  unsupported/test/polynomialsolver.cpp  58
-rw-r--r--  unsupported/test/polynomialutils.cpp  2
-rw-r--r--  unsupported/test/sparse_extra.cpp  91
-rw-r--r--  unsupported/test/special_functions.cpp  234
-rw-r--r--  unsupported/test/special_packetmath.cpp  149
-rw-r--r--  unsupported/test/splines.cpp  2
131 files changed, 15434 insertions, 4773 deletions
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
};
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
{
for(int i = 0; i < g_repeat; i++) {
#ifdef EIGEN_TEST_PART_1
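
Note: every test file touched by this patch migrates its entry point from a plain void test_<name>() function to the EIGEN_DECLARE_TEST(<name>) macro. The macro's real definition lives in Eigen's test/main.h; the sketch below is only an assumption about its shape, using simple token pasting to produce the symbol that matches the name passed to ei_add_test:

#include <cstdio>

#define EIGEN_CAT2(a, b) a##b
#define EIGEN_CAT(a, b) EIGEN_CAT2(a, b)

// Hypothetical stand-in: EIGEN_DECLARE_TEST(BVH) expands to the header of
// void test_BVH(), the function the harness invokes for ei_add_test(BVH).
#define EIGEN_DECLARE_TEST(X) void EIGEN_CAT(test_, X)()

EIGEN_DECLARE_TEST(BVH) { std::puts("BVH test body runs here"); }

int main() { test_BVH(); return 0; }
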
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index b5fa1c845..d30fa62bd 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
- foreach(i RANGE 1 999)
- file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
- "#ifdef EIGEN_TEST_PART_${i}\n"
- "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
- "#else\n"
- "#define CALL_SUBTEST_${i}(FUNC)\n"
- "#endif\n\n"
- )
- endforeach()
+# The file split_test_helper.h was generated at the first run;
+# it is now included in test/.
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
endif()
set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -22,22 +13,27 @@ include_directories(../../test ../../unsupported ../../Eigen
find_package (Threads)
find_package(GoogleHash)
-if(GOOGLEHASH_FOUND)
+if(GoogleHash_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
include_directories(${GOOGLEHASH_INCLUDES})
ei_add_property(EIGEN_TESTED_BACKENDS "GoogleHash, ")
-else(GOOGLEHASH_FOUND)
+else()
ei_add_property(EIGEN_MISSING_BACKENDS "GoogleHash, ")
-endif(GOOGLEHASH_FOUND)
+endif()
+
find_package(Adolc)
-if(ADOLC_FOUND)
+if(Adolc_FOUND)
include_directories(${ADOLC_INCLUDES})
ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
- ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
-else(ADOLC_FOUND)
+ if(EIGEN_TEST_CXX11)
+ ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+ else()
+ message(STATUS "Adolc found, but tests require C++11 mode")
+ endif()
+else()
ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
-endif(ADOLC_FOUND)
+endif()
# this test seems never to have been successful on x87, so it is considered to contain an FP-related bug.
# see thread: "non-linear optimization test summary"
@@ -47,9 +43,7 @@ ei_add_test(NumericalDiff)
ei_add_test(autodiff_scalar)
ei_add_test(autodiff)
-if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
ei_add_test(BVH)
-endif()
ei_add_test(matrix_exponential)
ei_add_test(matrix_function)
@@ -61,13 +55,11 @@ ei_add_test(FFT)
ei_add_test(EulerAngles)
-find_package(MPFR 2.3.0)
-find_package(GMP)
-if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
- include_directories(${MPFR_INCLUDES} ./mpreal)
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
- set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
- ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+ include_directories(${MPREAL_INCLUDES})
+ ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
else()
ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
endif()
@@ -87,8 +79,8 @@ else()
ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
endif()
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
find_package(OpenGL)
find_package(GLUT)
find_package(GLEW)
@@ -108,89 +100,192 @@ ei_add_test(polynomialsolver)
ei_add_test(polynomialutils)
ei_add_test(splines)
ei_add_test(gmres)
+ei_add_test(dgmres)
ei_add_test(minres)
+ei_add_test(idrs)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
ei_add_test(special_functions)
-
-# TODO: The following test names are prefixed with the cxx11 string, since historically
-# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
-# when using visual studio. We should make the check more strict to enable the tests for
-# newer versions of MSVC.
-if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-ei_add_test(cxx11_tensor_dimension)
-ei_add_test(cxx11_tensor_map)
-ei_add_test(cxx11_tensor_assign)
-ei_add_test(cxx11_tensor_comparisons)
-ei_add_test(cxx11_tensor_forced_eval)
-ei_add_test(cxx11_tensor_math)
-ei_add_test(cxx11_tensor_const)
-ei_add_test(cxx11_tensor_intdiv)
-ei_add_test(cxx11_tensor_casts)
-ei_add_test(cxx11_tensor_empty)
-ei_add_test(cxx11_tensor_sugar)
-ei_add_test(cxx11_tensor_roundings)
-ei_add_test(cxx11_tensor_layout_swap)
-ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
- # This test requires __uint128_t which is only available on 64bit systems
- ei_add_test(cxx11_tensor_uint128)
-endif()
-endif()
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
if(EIGEN_TEST_CXX11)
if(EIGEN_TEST_SYCL)
- ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
- endif(EIGEN_TEST_SYCL)
- # It should be safe to always run these tests as there is some fallback code for
- # older compiler that don't support cxx11.
- set(CMAKE_CXX_STANDARD 11)
+ set(EIGEN_SYCL ON)
+ # Forward CMake options as preprocessor definitions
+ if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+ add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+ endif()
+ if(EIGEN_SYCL_NO_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+ add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+ endif()
+ if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+ add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+ endif()
+ if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+ add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+ endif()
+ if(EIGEN_SYCL_REG_M)
+ add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+ endif()
+ if(EIGEN_SYCL_REG_N)
+ add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+ endif()
+ if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+ add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+ endif()
+ if(EIGEN_SYCL_ASYNC_EXECUTION)
+ add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+ endif()
+ if(EIGEN_SYCL_DISABLE_SKINNY)
+ add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+ endif()
+ if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+ add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+ endif()
+ if(EIGEN_SYCL_DISABLE_RANK1)
+ add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+ endif()
+ if(EIGEN_SYCL_DISABLE_SCALAR)
+ add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+ endif()
+ if(EIGEN_SYCL_DISABLE_GEMV)
+ add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+ endif()
+ if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+ add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+ endif()
+
+ if(EIGEN_SYCL_TRISYCL)
+ # triSYCL now requires c++17.
+ set(CMAKE_CXX_STANDARD 17)
+ else()
+ if(MSVC)
+ # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+ # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+ set(CMAKE_CXX_STANDARD 14)
+ list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+ else()
+ set(CMAKE_CXX_STANDARD 11)
+ list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+ endif()
+ # The following flags are not supported by Clang and can cause warnings
+ # if used with -Werror so they are removed here.
+ if(COMPUTECPP_USE_COMPILER_DRIVER)
+ set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+ string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+ string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+ string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+ endif()
+ list(APPEND COMPUTECPP_USER_FLAGS
+ -DEIGEN_NO_ASSERTION_CHECKING=1
+ -no-serial-memop
+ -Xclang
+ -cl-mad-enable)
+ endif()
+
+ ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+ ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+ set(EIGEN_SYCL OFF)
+ endif()
ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_meta)
- ei_add_test(cxx11_tensor_simple)
-# ei_add_test(cxx11_tensor_symmetry)
- ei_add_test(cxx11_tensor_index_list)
- ei_add_test(cxx11_tensor_mixed_indices)
+ ei_add_test(cxx11_maxsizevector)
+ ei_add_test(cxx11_tensor_argmax)
+ ei_add_test(cxx11_tensor_assign)
+ ei_add_test(cxx11_tensor_block_access)
+ ei_add_test(cxx11_tensor_block_eval)
+ ei_add_test(cxx11_tensor_block_io)
+ ei_add_test(cxx11_tensor_broadcasting)
+ ei_add_test(cxx11_tensor_casts)
+ ei_add_test(cxx11_tensor_chipping)
+ ei_add_test(cxx11_tensor_comparisons)
+ ei_add_test(cxx11_tensor_concatenation)
+ ei_add_test(cxx11_tensor_const)
ei_add_test(cxx11_tensor_contraction)
ei_add_test(cxx11_tensor_convolution)
+ ei_add_test(cxx11_tensor_custom_index)
+ ei_add_test(cxx11_tensor_custom_op)
+ ei_add_test(cxx11_tensor_dimension)
+ ei_add_test(cxx11_tensor_empty)
+ ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_expr)
+ ei_add_test(cxx11_tensor_fft)
ei_add_test(cxx11_tensor_fixed_size)
- ei_add_test(cxx11_tensor_of_const_values)
- ei_add_test(cxx11_tensor_of_complex)
- ei_add_test(cxx11_tensor_of_strings)
- ei_add_test(cxx11_tensor_lvalue)
- ei_add_test(cxx11_tensor_broadcasting)
- ei_add_test(cxx11_tensor_chipping)
- ei_add_test(cxx11_tensor_concatenation)
+ ei_add_test(cxx11_tensor_forced_eval)
+ ei_add_test(cxx11_tensor_generator)
+ ei_add_test(cxx11_tensor_ifft)
+ ei_add_test(cxx11_tensor_image_patch)
+ ei_add_test(cxx11_tensor_index_list)
ei_add_test(cxx11_tensor_inflation)
+ ei_add_test(cxx11_tensor_intdiv)
+ ei_add_test(cxx11_tensor_io)
+ ei_add_test(cxx11_tensor_layout_swap)
+ ei_add_test(cxx11_tensor_lvalue)
+ ei_add_test(cxx11_tensor_map)
+ ei_add_test(cxx11_tensor_math)
+ ei_add_test(cxx11_tensor_mixed_indices)
ei_add_test(cxx11_tensor_morphing)
+ ei_add_test(cxx11_tensor_move)
+ ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+ ei_add_test(cxx11_tensor_of_complex)
+ ei_add_test(cxx11_tensor_of_const_values)
+ ei_add_test(cxx11_tensor_of_strings)
ei_add_test(cxx11_tensor_padding)
ei_add_test(cxx11_tensor_patch)
- ei_add_test(cxx11_tensor_image_patch)
- ei_add_test(cxx11_tensor_volume_patch)
+ ei_add_test(cxx11_tensor_random)
ei_add_test(cxx11_tensor_reduction)
- ei_add_test(cxx11_tensor_argmax)
+ ei_add_test(cxx11_tensor_ref)
+ ei_add_test(cxx11_tensor_roundings)
+ ei_add_test(cxx11_tensor_scan)
ei_add_test(cxx11_tensor_shuffling)
+ ei_add_test(cxx11_tensor_simple)
ei_add_test(cxx11_tensor_striding)
- ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+ ei_add_test(cxx11_tensor_sugar)
+ ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
- ei_add_test(cxx11_tensor_ref)
- ei_add_test(cxx11_tensor_random)
- ei_add_test(cxx11_tensor_generator)
- ei_add_test(cxx11_tensor_custom_op)
- ei_add_test(cxx11_tensor_custom_index)
- ei_add_test(cxx11_tensor_fft)
- ei_add_test(cxx11_tensor_ifft)
- ei_add_test(cxx11_tensor_scan)
+ ei_add_test(cxx11_tensor_trace)
+ ei_add_test(cxx11_tensor_volume_patch)
+# ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ # This test requires __uint128_t which is only available on 64bit systems
+ ei_add_test(cxx11_tensor_uint128)
+endif()
endif()
@@ -213,7 +308,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
endif()
if(EIGEN_TEST_CUDA_CLANG)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+ string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+ foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+ string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+ endforeach()
endif()
set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
@@ -221,37 +320,98 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
endif()
- if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
- set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
- else()
- # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
- set(EIGEN_CUDA_CXX11_FLAG "")
- endif()
-
- set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+ set(NVCC_ARCH_FLAGS)
+ foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+ string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+ endforeach()
+ set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
- ei_add_test(cxx11_tensor_complex_cuda)
- ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
- ei_add_test(cxx11_tensor_reduction_cuda)
- ei_add_test(cxx11_tensor_argmax_cuda)
- ei_add_test(cxx11_tensor_cast_float16_cuda)
- ei_add_test(cxx11_tensor_scan_cuda)
+ ei_add_test(cxx11_tensor_complex_gpu)
+ ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+ ei_add_test(cxx11_tensor_reduction_gpu)
+ ei_add_test(cxx11_tensor_argmax_gpu)
+ ei_add_test(cxx11_tensor_cast_float16_gpu)
+ ei_add_test(cxx11_tensor_scan_gpu)
+
+ set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+ foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+ if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+ set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+ endif()
+ endforeach()
# Contractions require arch 3.0 or higher
- if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+ if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
- ei_add_test(cxx11_tensor_cuda)
- ei_add_test(cxx11_tensor_contract_cuda)
- ei_add_test(cxx11_tensor_of_float16_cuda)
+ ei_add_test(cxx11_tensor_gpu)
+ ei_add_test(cxx11_tensor_contract_gpu)
+ ei_add_test(cxx11_tensor_of_float16_gpu)
endif()
# The random number generation code requires arch 3.5 or greater.
- if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
- ei_add_test(cxx11_tensor_random_cuda)
+ if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+ ei_add_test(cxx11_tensor_random_gpu)
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+ set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+ if (EXISTS ${HIP_PATH})
+
+ list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+ find_package(HIP REQUIRED)
+ if (HIP_FOUND)
+
+ execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+ if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+ include_directories(${CMAKE_CURRENT_BINARY_DIR})
+ include_directories(${HIP_PATH}/include)
+
+ set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+ #
+ # complex datatype is not yet supported by HIP
+ # so leaving out those tests for now
+ #
+ # ei_add_test(cxx11_tensor_complex_gpu)
+ # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+ #
+ ei_add_test(cxx11_tensor_reduction_gpu)
+ ei_add_test(cxx11_tensor_argmax_gpu)
+ ei_add_test(cxx11_tensor_cast_float16_gpu)
+ ei_add_test(cxx11_tensor_scan_gpu)
+ ei_add_test(cxx11_tensor_device)
+
+ ei_add_test(cxx11_tensor_gpu)
+ ei_add_test(cxx11_tensor_contract_gpu)
+ ei_add_test(cxx11_tensor_of_float16_gpu)
+ ei_add_test(cxx11_tensor_random_gpu)
+
+ unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+ elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+ message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+ else ()
+ message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+ endif()
+
+ endif()
+
+ else ()
+
+ message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+ endif()
+
+endif()
+
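
Note: the first hunk of this CMakeLists.txt diff stops generating split_test_helper.h at configure time (and deletes any stale generated copy); the header is now checked in under test/. Reconstructed from the removed foreach loop, the generated file consisted of 999 guarded blocks of the following shape, so each numbered subtest compiles to a no-op unless its part is enabled:

// First two of the 999 blocks the removed CMake loop wrote into
// split_test_helper.h; the pattern repeats through CALL_SUBTEST_999.
#ifdef EIGEN_TEST_PART_1
#define CALL_SUBTEST_1(FUNC) CALL_SUBTEST(FUNC)
#else
#define CALL_SUBTEST_1(FUNC)
#endif

#ifdef EIGEN_TEST_PART_2
#define CALL_SUBTEST_2(FUNC) CALL_SUBTEST(FUNC)
#else
#define CALL_SUBTEST_2(FUNC)
#endif
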
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index a8cb52864..0955795b6 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -13,146 +13,220 @@
using namespace Eigen;
-template<typename EulerSystem, typename Scalar>
-void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
- bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+// Unfortunately, we need to specialize it in order for it to work. (We could add it to the main.h test framework.)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+ return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approximate range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+ do { \
+ VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+ VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+ } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
{
typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
typedef Matrix<Scalar,3,3> Matrix3;
typedef Matrix<Scalar,3,1> Vector3;
typedef Quaternion<Scalar> QuaternionType;
typedef AngleAxis<Scalar> AngleAxisType;
- using std::abs;
-
- Scalar alphaRangeStart, alphaRangeEnd;
- Scalar betaRangeStart, betaRangeEnd;
- Scalar gammaRangeStart, gammaRangeEnd;
- if (positiveRangeAlpha)
- {
- alphaRangeStart = Scalar(0);
- alphaRangeEnd = Scalar(2 * EIGEN_PI);
- }
- else
- {
- alphaRangeStart = -Scalar(EIGEN_PI);
- alphaRangeEnd = Scalar(EIGEN_PI);
- }
+ const Scalar ONE = Scalar(1);
+ const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+ const Scalar PI = Scalar(EIGEN_PI);
- if (positiveRangeBeta)
- {
- betaRangeStart = Scalar(0);
- betaRangeEnd = Scalar(2 * EIGEN_PI);
- }
- else
- {
- betaRangeStart = -Scalar(EIGEN_PI);
- betaRangeEnd = Scalar(EIGEN_PI);
- }
+ // It's very important to calculate the acceptable precision depending on the distance from the pole.
+ const Scalar longitudeRadius = std::abs(
+ EulerSystem::IsTaitBryan ?
+ std::cos(e.beta()) :
+ std::sin(e.beta())
+ );
+ Scalar precision = test_precision<Scalar>() / longitudeRadius;
- if (positiveRangeGamma)
+ Scalar betaRangeStart, betaRangeEnd;
+ if (EulerSystem::IsTaitBryan)
{
- gammaRangeStart = Scalar(0);
- gammaRangeEnd = Scalar(2 * EIGEN_PI);
+ betaRangeStart = -HALF_PI;
+ betaRangeEnd = HALF_PI;
}
else
{
- gammaRangeStart = -Scalar(EIGEN_PI);
- gammaRangeEnd = Scalar(EIGEN_PI);
+ if (!EulerSystem::IsBetaOpposite)
+ {
+ betaRangeStart = 0;
+ betaRangeEnd = PI;
+ }
+ else
+ {
+ betaRangeStart = -PI;
+ betaRangeEnd = 0;
+ }
}
- const int i = EulerSystem::AlphaAxisAbs - 1;
- const int j = EulerSystem::BetaAxisAbs - 1;
- const int k = EulerSystem::GammaAxisAbs - 1;
+ const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+ const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+ const Vector3 K_ = EulerAnglesType::GammaAxisVector();
- const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
- const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
- const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
-
- const Vector3 I = EulerAnglesType::AlphaAxisVector();
- const Vector3 J = EulerAnglesType::BetaAxisVector();
- const Vector3 K = EulerAnglesType::GammaAxisVector();
-
- EulerAnglesType e(ea[0], ea[1], ea[2]);
+ // Is approx checks
+ VERIFY(e.isApprox(e));
+ VERIFY_IS_APPROX(e, e);
+ VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+ const Matrix3 m(e);
+ VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+ EulerAnglesType ebis(m);
- Matrix3 m(e);
- Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+ // When there is no roll (acting like a polar representation), we have the best precision.
+ // One of those cases is when the Euler angles are on the pole, and because it's a singular case,
+ // the computation returns no roll.
+ if (ebis.beta() == 0)
+ precision = test_precision<Scalar>();
// Check that eabis is in range
- VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
- VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
- VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+ VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+ VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+ VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+ const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+ VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+ VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+ /*std::cout << "===================\n" <<
+ "e: " << e << std::endl <<
+ "eabis: " << eabis.transpose() << std::endl <<
+ "m: " << m << std::endl <<
+ "mbis: " << mbis << std::endl <<
+ "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+ "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+ VERIFY(m.isApprox(mbis, precision));
+
+ // Test if ea and eabis are the same
+ // Need to check both singular and non-singular cases
+ // There are two singular cases.
+ // 1. When I==K and sin(ea(1)) == 0
+ // 2. When I!=K and cos(ea(1)) == 0
+
+ // TODO: Make this test work well, and use a range saturation function.
+ /*// If I==K, and ea[1]==0, then there is no unique solution.
+ // The remark applies in the case where I!=K, and |ea[1]| is close to +-pi/2.
+ if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
+ VERIFY_IS_APPROX(ea, eabis);*/
- Vector3 eabis2 = m.eulerAngles(i, j, k);
+ // Quaternions
+ const QuaternionType q(e);
+ ebis = q;
+ const QuaternionType qbis(ebis);
+ VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+ //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
- // Invert the relevant axes
- eabis2[0] *= iFactor;
- eabis2[1] *= jFactor;
- eabis2[2] *= kFactor;
+ // A suggestion for a simple product test, for when it will be supported.
+ /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+ Matrix3 m2(e2);
+ VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler_vec<+A,+B,+C>(ea);
+ verify_euler_vec<+A,+B,-C>(ea);
+ verify_euler_vec<+A,-B,+C>(ea);
+ verify_euler_vec<+A,-B,-C>(ea);
- // Saturate the angles to the correct range
- if (positiveRangeAlpha && (eabis2[0] < 0))
- eabis2[0] += Scalar(2 * EIGEN_PI);
- if (positiveRangeBeta && (eabis2[1] < 0))
- eabis2[1] += Scalar(2 * EIGEN_PI);
- if (positiveRangeGamma && (eabis2[2] < 0))
- eabis2[2] += Scalar(2 * EIGEN_PI);
+ verify_euler_vec<-A,+B,+C>(ea);
+ verify_euler_vec<-A,+B,-C>(ea);
+ verify_euler_vec<-A,-B,+C>(ea);
+ verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler_all_neg<X,Y,Z>(ea);
+ verify_euler_all_neg<X,Y,X>(ea);
+ verify_euler_all_neg<X,Z,Y>(ea);
+ verify_euler_all_neg<X,Z,X>(ea);
- VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+ verify_euler_all_neg<Y,Z,X>(ea);
+ verify_euler_all_neg<Y,Z,Y>(ea);
+ verify_euler_all_neg<Y,X,Z>(ea);
+ verify_euler_all_neg<Y,X,Y>(ea);
- Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
- VERIFY_IS_APPROX(m, mbis);
+ verify_euler_all_neg<Z,X,Y>(ea);
+ verify_euler_all_neg<Z,X,Z>(ea);
+ verify_euler_all_neg<Z,Y,X>(ea);
+ verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+ typedef Matrix<Scalar,3,1> Vector3;
+ const Scalar PI = Scalar(EIGEN_PI);
- // Tests that are only relevant for no possitive range
- if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+ for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
{
- /* If I==K, and ea[1]==0, then there no unique solution. */
- /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */
- if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
- VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
-
- // approx_or_less_than does not work for 0
- VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+ check_all_var(Vector3(PI/4, singularBeta, PI/3));
+ check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+ check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+ check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+ check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+ check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+ check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+ check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+ check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
}
- // Quaternions
- QuaternionType q(e);
- eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
- VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
-}
-
-template<typename EulerSystem, typename Scalar>
-void verify_euler(const Matrix<Scalar,3,1>& ea)
-{
- verify_euler_ranged<EulerSystem>(ea, false, false, false);
- verify_euler_ranged<EulerSystem>(ea, false, false, true);
- verify_euler_ranged<EulerSystem>(ea, false, true, false);
- verify_euler_ranged<EulerSystem>(ea, false, true, true);
- verify_euler_ranged<EulerSystem>(ea, true, false, false);
- verify_euler_ranged<EulerSystem>(ea, true, false, true);
- verify_euler_ranged<EulerSystem>(ea, true, true, false);
- verify_euler_ranged<EulerSystem>(ea, true, true, true);
+ // This one is for sanity; it had a problem with near-pole cases with float scalars.
+ check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
}
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+template<typename Scalar> void eulerangles_manual()
{
- verify_euler<EulerSystemXYZ>(ea);
- verify_euler<EulerSystemXYX>(ea);
- verify_euler<EulerSystemXZY>(ea);
- verify_euler<EulerSystemXZX>(ea);
-
- verify_euler<EulerSystemYZX>(ea);
- verify_euler<EulerSystemYZY>(ea);
- verify_euler<EulerSystemYXZ>(ea);
- verify_euler<EulerSystemYXY>(ea);
-
- verify_euler<EulerSystemZXY>(ea);
- verify_euler<EulerSystemZXZ>(ea);
- verify_euler<EulerSystemZYX>(ea);
- verify_euler<EulerSystemZYZ>(ea);
+ typedef Matrix<Scalar,3,1> Vector3;
+ typedef Matrix<Scalar,Dynamic,1> VectorX;
+ const Vector3 Zero = Vector3::Zero();
+ const Scalar PI = Scalar(EIGEN_PI);
+
+ check_all_var(Zero);
+
+ // singular cases
+ check_singular_cases(PI/2);
+ check_singular_cases(-PI/2);
+
+ check_singular_cases(Scalar(0));
+ check_singular_cases(Scalar(-0));
+
+ check_singular_cases(PI);
+ check_singular_cases(-PI);
+
+ // non-singular cases
+ VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+ VectorX beta = VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+ VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+ for (int i = 0; i < alpha.size(); ++i) {
+ for (int j = 0; j < beta.size(); ++j) {
+ for (int k = 0; k < gamma.size(); ++k) {
+ check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+ }
+ }
+ }
}
-template<typename Scalar> void eulerangles()
+template<typename Scalar> void eulerangles_rand()
{
typedef Matrix<Scalar,3,3> Matrix3;
typedef Matrix<Scalar,3,1> Vector3;
@@ -199,10 +273,24 @@ template<typename Scalar> void eulerangles()
check_all_var(ea);
}
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
{
+ // Simple cast test
+ EulerAnglesXYZd onesEd(1, 1, 1);
+ EulerAnglesXYZf onesEf = onesEd.cast<float>();
+ VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+ // Simple Construction from Vector3 test
+ VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+
+ CALL_SUBTEST_1( eulerangles_manual<float>() );
+ CALL_SUBTEST_2( eulerangles_manual<double>() );
+
for(int i = 0; i < g_repeat; i++) {
- CALL_SUBTEST_1( eulerangles<float>() );
- CALL_SUBTEST_2( eulerangles<double>() );
+ CALL_SUBTEST_3( eulerangles_rand<float>() );
+ CALL_SUBTEST_4( eulerangles_rand<double>() );
}
+
+ // TODO: Add tests for auto diff
+ // TODO: Add tests for complex numbers
}
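
Note: verify_euler above loosens the comparison tolerance as beta approaches a singular pole, dividing the base precision by the longitude radius (|cos(beta)| for Tait-Bryan systems, |sin(beta)| for proper Euler systems, per the patch's own comment). A standalone sketch of that rule; base_precision is a hypothetical stand-in for test_precision<Scalar>():

#include <cmath>
#include <cstdio>

double acceptable_precision(double beta, bool is_tait_bryan,
                            double base_precision = 1e-12) {
  // Distance of the axis from the pole; shrinks toward zero near the singularity.
  const double longitude_radius =
      std::abs(is_tait_bryan ? std::cos(beta) : std::sin(beta));
  return base_precision / longitude_radius;  // smaller radius => looser check
}

int main() {
  std::printf("far from the pole: %g\n", acceptable_precision(0.1, true));
  std::printf("near the pole:     %g\n", acceptable_precision(1.5707, true));
  return 0;
}
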
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
VERIFY( (in1-in).norm() < test_precision<float>() );
}
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
{
CALL_SUBTEST( test_return_by_value(32) );
//CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..c667b7247 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -15,6 +15,15 @@
// tolerance for checking the number of iterations
#define LM_EVAL_COUNT_TOL 4/3
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+ ++g_test_level; \
+ VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+ VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+ --g_test_level; \
+ VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+ VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+ }
+
int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
{
/* subroutine fcn for chkder example. */
@@ -180,8 +189,7 @@ void testLmder1()
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 6);
- VERIFY_IS_EQUAL(lm.njev, 5);
+ LM_CHECK_N_ITERS(lm, 6, 5);
// check norm
VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -209,8 +217,7 @@ void testLmder()
// check return values
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 6);
- VERIFY_IS_EQUAL(lm.njev, 5);
+ LM_CHECK_N_ITERS(lm, 6, 5);
// check norm
fnorm = lm.fvec.blueNorm();
@@ -294,8 +301,7 @@ void testHybrj1()
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(solver.nfev, 11);
- VERIFY_IS_EQUAL(solver.njev, 1);
+ LM_CHECK_N_ITERS(solver, 11, 1);
// check norm
VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -329,8 +335,7 @@ void testHybrj()
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(solver.nfev, 11);
- VERIFY_IS_EQUAL(solver.njev, 1);
+ LM_CHECK_N_ITERS(solver, 11, 1);
// check norm
VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -485,8 +490,7 @@ void testLmstr1()
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 6);
- VERIFY_IS_EQUAL(lm.njev, 5);
+ LM_CHECK_N_ITERS(lm, 6, 5);
// check norm
VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -514,8 +518,7 @@ void testLmstr()
// check return values
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 6);
- VERIFY_IS_EQUAL(lm.njev, 5);
+ LM_CHECK_N_ITERS(lm, 6, 5);
// check norm
fnorm = lm.fvec.blueNorm();
@@ -565,7 +568,7 @@ void testLmdif1()
// do the computation
lmdif_functor functor;
- DenseIndex nfev;
+ DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
// check return value
@@ -686,8 +689,7 @@ void testNistChwirut2(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 10);
- VERIFY_IS_EQUAL(lm.njev, 8);
+ LM_CHECK_N_ITERS(lm, 10, 8);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
// check x
@@ -707,8 +709,7 @@ void testNistChwirut2(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 7);
- VERIFY_IS_EQUAL(lm.njev, 6);
+ LM_CHECK_N_ITERS(lm, 7, 6);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
// check x
@@ -766,8 +767,7 @@ void testNistMisra1a(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 19);
- VERIFY_IS_EQUAL(lm.njev, 15);
+ LM_CHECK_N_ITERS(lm, 19, 15);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
// check x
@@ -783,8 +783,7 @@ void testNistMisra1a(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 5);
- VERIFY_IS_EQUAL(lm.njev, 4);
+ LM_CHECK_N_ITERS(lm, 5, 4);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
// check x
@@ -856,8 +855,7 @@ void testNistHahn1(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 11);
- VERIFY_IS_EQUAL(lm.njev, 10);
+ LM_CHECK_N_ITERS(lm, 11, 10);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
// check x
@@ -878,8 +876,7 @@ void testNistHahn1(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 11);
- VERIFY_IS_EQUAL(lm.njev, 10);
+ LM_CHECK_N_ITERS(lm, 11, 10);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
// check x
@@ -942,8 +939,7 @@ void testNistMisra1d(void)
// check return value
VERIFY_IS_EQUAL(info, 3);
- VERIFY_IS_EQUAL(lm.nfev, 9);
- VERIFY_IS_EQUAL(lm.njev, 7);
+ LM_CHECK_N_ITERS(lm, 9, 7);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
// check x
@@ -959,8 +955,7 @@ void testNistMisra1d(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 4);
- VERIFY_IS_EQUAL(lm.njev, 3);
+ LM_CHECK_N_ITERS(lm, 4, 3);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
// check x
@@ -1020,8 +1015,7 @@ void testNistLanczos1(void)
// check return value
VERIFY_IS_EQUAL(info, 2);
- VERIFY_IS_EQUAL(lm.nfev, 79);
- VERIFY_IS_EQUAL(lm.njev, 72);
+ LM_CHECK_N_ITERS(lm, 79, 72);
// check norm^2
std::cout.precision(30);
std::cout << lm.fvec.squaredNorm() << "\n";
@@ -1043,8 +1037,7 @@ void testNistLanczos1(void)
// check return value
VERIFY_IS_EQUAL(info, 2);
- VERIFY_IS_EQUAL(lm.nfev, 9);
- VERIFY_IS_EQUAL(lm.njev, 8);
+ LM_CHECK_N_ITERS(lm, 9, 8);
// check norm^2
VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
// check x
@@ -1108,8 +1101,7 @@ void testNistRat42(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 10);
- VERIFY_IS_EQUAL(lm.njev, 8);
+ LM_CHECK_N_ITERS(lm, 10, 8);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
// check x
@@ -1126,8 +1118,7 @@ void testNistRat42(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 6);
- VERIFY_IS_EQUAL(lm.njev, 5);
+ LM_CHECK_N_ITERS(lm, 6, 5);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
// check x
@@ -1186,8 +1177,7 @@ void testNistMGH10(void)
// check return value
VERIFY_IS_EQUAL(info, 2);
- VERIFY_IS_EQUAL(lm.nfev, 284 );
- VERIFY_IS_EQUAL(lm.njev, 249 );
+ LM_CHECK_N_ITERS(lm, 284, 249);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
// check x
@@ -1204,8 +1194,7 @@ void testNistMGH10(void)
// check return value
VERIFY_IS_EQUAL(info, 3);
- VERIFY_IS_EQUAL(lm.nfev, 126);
- VERIFY_IS_EQUAL(lm.njev, 116);
+ LM_CHECK_N_ITERS(lm, 126, 116);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
// check x
@@ -1265,8 +1254,7 @@ void testNistBoxBOD(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY(lm.nfev < 31); // 31
- VERIFY(lm.njev < 25); // 25
+ LM_CHECK_N_ITERS(lm, 31, 25);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
// check x
@@ -1284,9 +1272,8 @@ void testNistBoxBOD(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 15 );
- VERIFY_IS_EQUAL(lm.njev, 14 );
+ VERIFY_IS_EQUAL(info, 1);
+ LM_CHECK_N_ITERS(lm, 15, 14);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
// check x
@@ -1356,12 +1343,7 @@ void testNistMGH17(void)
// check return value
VERIFY_IS_EQUAL(info, 2);
- ++g_test_level;
- VERIFY_IS_EQUAL(lm.nfev, 602); // 602
- VERIFY_IS_EQUAL(lm.njev, 545); // 545
- --g_test_level;
- VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
- VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
+ LM_CHECK_N_ITERS(lm, 602, 545);
/*
* Second try
@@ -1373,8 +1355,7 @@ void testNistMGH17(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 18);
- VERIFY_IS_EQUAL(lm.njev, 15);
+ LM_CHECK_N_ITERS(lm, 18, 15);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
// check x
@@ -1438,9 +1419,8 @@ void testNistMGH09(void)
info = lm.minimize(x);
// check return value
- VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 490 );
- VERIFY_IS_EQUAL(lm.njev, 376 );
+ VERIFY_IS_EQUAL(info, 1);
+ LM_CHECK_N_ITERS(lm, 490, 376);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
// check x
@@ -1459,8 +1439,7 @@ void testNistMGH09(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 18);
- VERIFY_IS_EQUAL(lm.njev, 16);
+ LM_CHECK_N_ITERS(lm, 18, 16);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
// check x
@@ -1525,8 +1504,7 @@ void testNistBennett5(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 758);
- VERIFY_IS_EQUAL(lm.njev, 744);
+ LM_CHECK_N_ITERS(lm, 758, 744);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
// check x
@@ -1543,8 +1521,7 @@ void testNistBennett5(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 203);
- VERIFY_IS_EQUAL(lm.njev, 192);
+ LM_CHECK_N_ITERS(lm, 203, 192);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
// check x
@@ -1613,8 +1590,7 @@ void testNistThurber(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 39);
- VERIFY_IS_EQUAL(lm.njev, 36);
+ LM_CHECK_N_ITERS(lm, 39,36);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
// check x
@@ -1638,8 +1614,7 @@ void testNistThurber(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 29);
- VERIFY_IS_EQUAL(lm.njev, 28);
+ LM_CHECK_N_ITERS(lm, 29, 28);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
// check x
@@ -1705,8 +1680,7 @@ void testNistRat43(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 27);
- VERIFY_IS_EQUAL(lm.njev, 20);
+ LM_CHECK_N_ITERS(lm, 27, 20);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
// check x
@@ -1727,8 +1701,7 @@ void testNistRat43(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 9);
- VERIFY_IS_EQUAL(lm.njev, 8);
+ LM_CHECK_N_ITERS(lm, 9, 8);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
// check x
@@ -1790,8 +1763,7 @@ void testNistEckerle4(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 18);
- VERIFY_IS_EQUAL(lm.njev, 15);
+ LM_CHECK_N_ITERS(lm, 18, 15);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
// check x
@@ -1808,8 +1780,7 @@ void testNistEckerle4(void)
// check return value
VERIFY_IS_EQUAL(info, 1);
- VERIFY_IS_EQUAL(lm.nfev, 7);
- VERIFY_IS_EQUAL(lm.njev, 6);
+ LM_CHECK_N_ITERS(lm, 7, 6);
// check norm^2
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
// check x
@@ -1818,7 +1789,7 @@ void testNistEckerle4(void)
VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
}
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
{
// Tests using the examples provided by (c)minpack
CALL_SUBTEST/*_1*/(testChkder());
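
Note: the LM_CHECK_N_ITERS macro added at the top of this file checks the exact nfev/njev counts inside a ++g_test_level/--g_test_level bracket, then enforces only the LM_EVAL_COUNT_TOL upper bounds. A self-contained sketch of that two-tier pattern follows; the assumption (not visible in the patch) is that Eigen's harness demotes failed checks to warnings while g_test_level is nonzero:

#include <cstdio>

static int g_test_level = 0;

// Toy check: a warning while g_test_level > 0, a hard error otherwise.
#define VERIFY_IS_EQUAL(a, b)                                        \
  do {                                                               \
    if ((a) != (b))                                                  \
      std::printf("%s: %s != %s\n",                                  \
                  g_test_level > 0 ? "warning" : "ERROR", #a, #b);   \
  } while (0)

int main() {
  const int nfev = 7, expected = 6;
  ++g_test_level;                  // exact count is advisory only
  VERIFY_IS_EQUAL(nfev, expected);
  --g_test_level;
  if (nfev > expected * 4 / 3)     // mirrors LM_EVAL_COUNT_TOL = 4/3
    std::puts("ERROR: evaluation count exceeds tolerance");
  return 0;
}
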
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..6d836413b 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -24,7 +24,7 @@ struct Functor
int m_inputs, m_values;
Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
- Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+ Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
int inputs() const { return m_inputs; }
int values() const { return m_values; }
@@ -107,7 +107,7 @@ void test_central()
VERIFY_IS_APPROX(jac, actual_jac);
}
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
{
CALL_SUBTEST(test_forward());
CALL_SUBTEST(test_central());
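
Note: the NumericalDiff hunk renames the Functor constructor parameters to inputs_/values_ so they no longer shadow the inputs() and values() accessors. A minimal illustration of the pattern (hypothetical Functor, compiled with -Wshadow to surface the original issue):

struct Functor {
  int m_inputs, m_values;
  // Parameters named inputs/values would shadow the accessors below under
  // -Wshadow; the trailing underscores sidestep the warning.
  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
  int inputs() const { return m_inputs; }
  int values() const { return m_values; }
};

int main() {
  Functor f(3, 5);
  return f.inputs() + f.values() == 8 ? 0 : 1;
}
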
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..f442e416a 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -70,13 +70,16 @@ void alignedvector3()
VERIFY_IS_APPROX(f6,r1-r4);
}
+ FastType f8, f9(0,0,0);
+ VERIFY_IS_APPROX(f9-f1,-f1);
+
std::stringstream ss1, ss2;
ss1 << f1;
ss2 << r1;
VERIFY(ss1.str()==ss2.str());
}
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST( alignedvector3<float>() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 85743137e..2cea56ba5 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -44,7 +44,7 @@ struct TestFunc1
int m_inputs, m_values;
TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
- TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+ TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
int inputs() const { return m_inputs; }
int values() const { return m_values; }
@@ -306,6 +306,8 @@ double bug_1222() {
return denom.value();
}
+#ifdef EIGEN_TEST_PART_5
+
double bug_1223() {
using std::min;
typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
@@ -326,8 +328,8 @@ double bug_1223() {
// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
void bug_1260() {
- Matrix4d A;
- Vector4d v;
+ Matrix4d A = Matrix4d::Ones();
+ Vector4d v = Vector4d::Ones();
A*v;
}
@@ -336,7 +338,7 @@ double bug_1261() {
typedef AutoDiffScalar<Matrix2d> AD;
typedef Matrix<AD,2,1> VectorAD;
- VectorAD v;
+ VectorAD v(0.,0.);
const AD maxVal = v.maxCoeff();
const AD minVal = v.minCoeff();
return maxVal.value() + minVal.value();
@@ -344,13 +346,30 @@ double bug_1261() {
double bug_1264() {
typedef AutoDiffScalar<Vector2d> AD;
- const AD s;
- const Matrix<AD, 3, 1> v1;
+ const AD s = 0.;
+ const Matrix<AD, 3, 1> v1(0.,0.,0.);
const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
return v2(0).value();
}
-void test_autodiff()
+// check with expressions on constants
+double bug_1281() {
+ int n = 2;
+ typedef AutoDiffScalar<VectorXd> AD;
+ const AD c = 1.;
+ AD x0(2,n,0);
+ AD y1 = (AD(c)+AD(c))*x0;
+ y1 = x0 * (AD(c)+AD(c));
+ AD y2 = (-AD(c))+x0;
+ y2 = x0+(-AD(c));
+ AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+ y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+ return (y1+y2+y3).value();
+}
+
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( test_autodiff_scalar<1>() );
@@ -359,9 +378,10 @@ void test_autodiff()
CALL_SUBTEST_4( test_autodiff_hessian<1>() );
}
- bug_1222();
- bug_1223();
- bug_1260();
- bug_1261();
+ CALL_SUBTEST_5( bug_1222() );
+ CALL_SUBTEST_5( bug_1223() );
+ CALL_SUBTEST_5( bug_1260() );
+ CALL_SUBTEST_5( bug_1261() );
+ CALL_SUBTEST_5( bug_1281() );
}
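
Note: this hunk moves the bug_12xx regression checks behind EIGEN_TEST_PART_5 and dispatches them through CALL_SUBTEST_5, so they exist only in the build of this file that defines that part. A minimal sketch of the gating; CALL_SUBTEST here is a simplified stand-in for the harness's macro:

#include <cstdio>

#define CALL_SUBTEST(FUNC) do { FUNC; } while (0)

#ifdef EIGEN_TEST_PART_5
#define CALL_SUBTEST_5(FUNC) CALL_SUBTEST(FUNC)
double bug_1223() { std::puts("bug_1223 ran"); return 0.0; }  // stand-in body
#else
#define CALL_SUBTEST_5(FUNC)  // expands to nothing in every other part
#endif

int main() {
  CALL_SUBTEST_5(bug_1223());  // a no-op unless built with -DEIGEN_TEST_PART_5
  return 0;
}
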
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 9cf11280c..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -81,12 +81,15 @@ void check_limits_specialization()
typedef std::numeric_limits<AD> A;
typedef std::numeric_limits<Scalar> B;
+ // work around the "unused typedef" warning:
+ VERIFY(!bool(internal::is_same<B, A>::value));
+
#if EIGEN_HAS_CXX11
VERIFY(bool(std::is_base_of<B, A>::value));
#endif
}
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( check_atan2<float>() );
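
Note: check_limits_specialization above asserts (under C++11) that std::numeric_limits of the AutoDiffScalar type derives from the underlying scalar's limits. A hypothetical mini-example of that specialization pattern for an arbitrary wrapper type; MyScalar is an illustration, not Eigen code:

#include <limits>
#include <type_traits>

struct MyScalar { double value; };

// Specialize numeric_limits for the wrapper by inheriting the wrapped
// type's traits; this is exactly what the test's is_base_of check detects.
namespace std {
template <> struct numeric_limits<MyScalar> : numeric_limits<double> {};
}

static_assert(std::is_base_of<std::numeric_limits<double>,
                              std::numeric_limits<MyScalar> >::value,
              "wrapper limits inherit from the underlying scalar's limits");

int main() { return 0; }
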
diff --git a/unsupported/test/bessel_functions.cpp b/unsupported/test/bessel_functions.cpp
new file mode 100644
index 000000000..06765bfab
--- /dev/null
+++ b/unsupported/test/bessel_functions.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)
+{
+ for(Index i=0; i<x.size(); ++i)
+ {
+ if((numext::isfinite)(y(i))) {
+ VERIFY_IS_APPROX( x(i), y(i) );
+ }
+ else if((numext::isnan)(y(i)))
+ VERIFY((numext::isnan)(x(i)));
+ else
+ VERIFY_IS_EQUAL( x(i), y(i) );
+ }
+}
+
+template<typename ArrayType> void array_bessel_functions()
+{
+ // Test Bessel function i0. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << 4.35582826e+07, 6.21841242e+06, 8.93446228e+05, 1.29418563e+05,
+ 1.89489253e+04, 2.81571663e+03, 4.27564116e+02, 6.72344070e+01,
+ 1.13019220e+01, 2.27958530e+00, 1.00000000e+00, 2.27958530e+00,
+ 1.13019220e+01, 6.72344070e+01, 4.27564116e+02, 2.81571663e+03,
+ 1.89489253e+04, 1.29418563e+05, 8.93446228e+05, 6.21841242e+06,
+ 4.35582826e+07;
+
+ CALL_SUBTEST(res = bessel_i0(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function i0e. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << 0.0897803118848, 0.0947062952128, 0.100544127361,
+ 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+ 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+ 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+ 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+ 0.0897803118848;
+
+ CALL_SUBTEST(res = bessel_i0e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function i1. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << -4.24549734e+07, -6.04313324e+06, -8.65059436e+05, -1.24707259e+05,
+ -1.81413488e+04, -2.67098830e+03, -3.99873137e+02, -6.13419368e+01,
+ -9.75946515e+00, -1.59063685e+00, 0.00000000e+00, 1.59063685e+00,
+ 9.75946515e+00, 6.13419368e+01, 3.99873137e+02, 2.67098830e+03,
+ 1.81413488e+04, 1.24707259e+05, 8.65059436e+05, 6.04313324e+06,
+ 4.24549734e+07;
+
+ CALL_SUBTEST(res = bessel_i1(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function i1e. Reference results obtained with SciPy.
+ {
+ ArrayType x(21);
+ ArrayType expected(21);
+ ArrayType res(21);
+
+ x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0,
+ 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected << -0.0875062221833, -0.092036796872, -0.0973496147565,
+ -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+ -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+ 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+ 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+ 0.0875062221833;
+
+ CALL_SUBTEST(res = bessel_i1e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function j0. Reference results obtained with SciPy.
+ {
+ ArrayType x(77);
+ ArrayType expected(77);
+ ArrayType res(77);
+
+ x << -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+ -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+ -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8.,
+ -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3.,
+ 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
+ 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36.,
+ 37., 38.;
+
+ expected << 0.11433274, 0.01086237, -0.10556738,
+ -0.12684568, -0.03042119, 0.09727067, 0.13807901, 0.05120815,
+ -0.08636798, -0.14784876, -0.07315701, 0.07274192, 0.15599932,
+ 0.09626678, -0.05623027, -0.16241278, -0.12065148, 0.03657907,
+ 0.16702466, 0.14662944, -0.01335581, -0.16985425, -0.17489907,
+ -0.01422447, 0.17107348, 0.2069261 , 0.04768931, -0.1711903 ,
+ -0.24593576, -0.09033361, 0.17165081, 0.30007927, 0.15064526,
+ -0.17759677, -0.39714981, -0.26005195, 0.22389078, 0.76519769,
+ 1. , 0.76519769, 0.22389078, -0.26005195, -0.39714981,
+ -0.17759677, 0.15064526, 0.30007927, 0.17165081, -0.09033361,
+ -0.24593576, -0.1711903 , 0.04768931, 0.2069261 , 0.17107348,
+ -0.01422447, -0.17489907, -0.16985425, -0.01335581, 0.14662944,
+ 0.16702466, 0.03657907, -0.12065148, -0.16241278, -0.05623027,
+ 0.09626678, 0.15599932, 0.07274192, -0.07315701, -0.14784876,
+ -0.08636798, 0.05120815, 0.13807901, 0.09727067, -0.03042119,
+ -0.12684568, -0.10556738, 0.01086237, 0.11433274;
+
+ CALL_SUBTEST(res = bessel_j0(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function j1. Reference results obtained with SciPy.
+ {
+ ArrayType x(81);
+ ArrayType expected(81);
+ ArrayType res(81);
+
+ x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30.,
+ -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19.,
+ -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8.,
+ -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3.,
+ 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
+ 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36.,
+ 37., 38., 39., 40.;
+
+ expected << -0.12603832, -0.0640561 , 0.05916189, 0.13058004, 0.08232981,
+ -0.04399094, -0.13297118, -0.10061965, 0.02658903, 0.13302432,
+ 0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573,
+ 0.12535025, 0.15403807, 0.03951932, -0.11717779, -0.17112027,
+ -0.06683312, 0.10570143, 0.18799489, 0.09766849, -0.09039718,
+ -0.20510404, -0.13337515, 0.07031805, 0.2234471 , 0.1767853 ,
+ -0.04347275, -0.24531179, -0.23463635, 0.00468282, 0.27668386,
+ 0.32757914, 0.06604333, -0.33905896, -0.57672481, -0.44005059,
+ 0. , 0.44005059, 0.57672481, 0.33905896, -0.06604333,
+ -0.32757914, -0.27668386, -0.00468282, 0.23463635, 0.24531179,
+ 0.04347275, -0.1767853 , -0.2234471 , -0.07031805, 0.13337515,
+ 0.20510404, 0.09039718, -0.09766849, -0.18799489, -0.10570143,
+ 0.06683312, 0.17112027, 0.11717779, -0.03951932, -0.15403807,
+ -0.12535025, 0.01504573, 0.13658472, 0.13055149, 0.0069342 ,
+ -0.11875106, -0.13302432, -0.02658903, 0.10061965, 0.13297118,
+ 0.04399094, -0.08232981, -0.13058004, -0.05916189, 0.0640561 ,
+ 0.12603832;
+
+ CALL_SUBTEST(res = bessel_j1(x);
+ verify_component_wise(res, expected););
+ }
+ // Test Bessel function k0e. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+ 0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 ,
+ 0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488,
+ 0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802,
+ 0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+ 0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616,
+ 0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602,
+ 0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778,
+ 0.20265524, 0.20005668, 0.19755558;
+
+ CALL_SUBTEST(res = bessel_k0e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function k0. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01,
+ 3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03,
+ 4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05,
+ 6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07,
+ 9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09,
+ 1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11,
+ 2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12,
+ 4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14,
+ 7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16,
+ 1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18,
+ 2.31029936e-18, 8.39286110e-19;
+
+ CALL_SUBTEST(res = bessel_k0(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function k0e. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822,
+ 0.6977616 , 0.60929767, 0.54780756, 0.50186313,
+ 0.4658451 , 0.43662302, 0.41229555, 0.39163193,
+ 0.3737955 , 0.35819488, 0.34439865, 0.33208364,
+ 0.32100235, 0.31096159, 0.30180802, 0.29341821,
+ 0.28569149, 0.27854488, 0.2719092 , 0.26572635,
+ 0.25994703, 0.25452917, 0.2494366 , 0.24463801,
+ 0.24010616, 0.23581722, 0.23175022, 0.22788667,
+ 0.22421014, 0.22070602, 0.21736123, 0.21416406,
+ 0.21110397, 0.20817141, 0.20535778, 0.20265524,
+ 0.20005668, 0.19755558;
+
+ CALL_SUBTEST(res = bessel_k0e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function k1. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01,
+ 4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03,
+ 4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05,
+ 6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07,
+ 1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09,
+ 1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11,
+ 2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12,
+ 4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14,
+ 7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16,
+ 1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18,
+ 2.33973340e-18, 8.49713195e-19;
+
+ CALL_SUBTEST(res = bessel_k1(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function k1e. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685,
+ 0.80656348, 0.68157595, 0.60027386, 0.54217591,
+ 0.49807158, 0.46314909, 0.43462525, 0.41076657,
+ 0.39043094, 0.37283175, 0.35740757, 0.34374563,
+ 0.33153489, 0.32053597, 0.31056123, 0.30146131,
+ 0.29311559, 0.2854255 , 0.27830958, 0.27169987,
+ 0.26553913, 0.25977879, 0.25437733, 0.249299 ,
+ 0.24451285, 0.23999191, 0.2357126 , 0.23165413,
+ 0.22779816, 0.22412841, 0.22063036, 0.21729103,
+ 0.21409878, 0.21104314, 0.20811462, 0.20530466,
+ 0.20260547, 0.20000997;
+
+ CALL_SUBTEST(res = bessel_k1e(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function y0. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << -0.93157302, -0.44451873, 0.08825696, 0.51037567, 0.37685001,
+ -0.01694074, -0.30851763, -0.28819468, -0.02594974, 0.22352149,
+ 0.2499367 , 0.05567117, -0.16884732, -0.22523731, -0.07820786,
+ 0.12719257, 0.2054643 , 0.095811 , -0.0926372 , -0.18755216,
+ -0.10951969, 0.0626406 , 0.17020176, 0.1198876 , -0.03598179,
+ -0.15283403, -0.12724943, 0.01204463, 0.13521498, 0.13183647,
+ 0.00948116, -0.11729573, -0.13383266, -0.02874248, 0.09913483,
+ 0.13340405, 0.04579799, -0.08085609, -0.13071488, -0.06066076,
+ 0.06262353, 0.12593642;
+
+ CALL_SUBTEST(res = bessel_y0(x);
+ verify_component_wise(res, expected););
+ }
+
+ // Test Bessel function y1. Reference results obtained with SciPy.
+ {
+ ArrayType x(42);
+ ArrayType expected(42);
+ ArrayType res(42);
+
+ x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.,
+ 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
+ 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
+ 39., 40.;
+
+ expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243,
+ 0.32467442, 0.39792571, 0.14786314, -0.17501034, -0.30266724,
+ -0.15806046, 0.10431458, 0.24901542, 0.16370554, -0.05709922,
+ -0.21008141, -0.16664484, 0.02107363, 0.17797517, 0.16720504,
+ 0.00815513, -0.14956011, -0.16551161, -0.03253926, 0.12340586,
+ 0.1616692 , 0.05305978, -0.09882996, -0.15579655, -0.07025124,
+ 0.07552213, 0.14803412, 0.08442557, -0.05337283, -0.13854483,
+ -0.09578012, 0.03238588, 0.12751273, 0.10445477, -0.01262946,
+ -0.11514066, -0.11056411, -0.00579351;
+
+ CALL_SUBTEST(res = bessel_y1(x);
+ verify_component_wise(res, expected););
+ }
+}
+
+EIGEN_DECLARE_TEST(bessel_functions)
+{
+ CALL_SUBTEST_1(array_bessel_functions<ArrayXf>());
+ CALL_SUBTEST_2(array_bessel_functions<ArrayXd>());
+}
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index 3b598bf42..7bf4e965f 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -30,11 +30,11 @@ static void test_basic_eventcount()
EventCount ec(waiters);
EventCount::Waiter& w = waiters[0];
ec.Notify(false);
- ec.Prewait(&w);
+ ec.Prewait();
ec.Notify(true);
ec.CommitWait(&w);
- ec.Prewait(&w);
- ec.CancelWait(&w);
+ ec.Prewait();
+ ec.CancelWait();
}
// Fake bounded counter-based queue.
@@ -112,7 +112,7 @@ static void test_stress_eventcount()
unsigned idx = rand_reentrant(&rnd) % kQueues;
if (queues[idx].Pop()) continue;
j--;
- ec.Prewait(&w);
+ ec.Prewait();
bool empty = true;
for (int q = 0; q < kQueues; q++) {
if (!queues[q].Empty()) {
@@ -121,7 +121,7 @@ static void test_stress_eventcount()
}
}
if (!empty) {
- ec.CancelWait(&w);
+ ec.CancelWait();
continue;
}
ec.CommitWait(&w);
@@ -135,7 +135,7 @@ static void test_stress_eventcount()
}
}
-void test_cxx11_eventcount()
+EIGEN_DECLARE_TEST(cxx11_eventcount)
{
CALL_SUBTEST(test_basic_eventcount());
CALL_SUBTEST(test_stress_eventcount());
diff --git a/unsupported/test/cxx11_maxsizevector.cpp b/unsupported/test/cxx11_maxsizevector.cpp
new file mode 100644
index 000000000..46b689a8e
--- /dev/null
+++ b/unsupported/test/cxx11_maxsizevector.cpp
@@ -0,0 +1,77 @@
+#include "main.h"
+
+#include <exception> // std::exception
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+struct Foo
+{
+ static Index object_count;
+ static Index object_limit;
+ EIGEN_ALIGN_TO_BOUNDARY(128) int dummy;
+
+ Foo(int x=0) : dummy(x)
+ {
+#ifdef EIGEN_EXCEPTIONS
+ // TODO: Is this the correct way to handle this?
+ if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); }
+#endif
+ std::cout << '+';
+ ++Foo::object_count;
+ eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+ }
+ Foo(const Foo&)
+ {
+ std::cout << 'c';
+ ++Foo::object_count;
+ eigen_assert((internal::UIntPtr(this) & (127)) == 0);
+ }
+
+ ~Foo()
+ {
+ std::cout << '~';
+ --Foo::object_count;
+ }
+
+ class Fail : public std::exception {};
+};
+
+Index Foo::object_count = 0;
+Index Foo::object_limit = 0;
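+
+// The test below lets Foo's constructor throw once object_count exceeds
+// object_limit, which verifies that MaxSizeVector destroys the elements it
+// already constructed when a later construction fails (object_count must
+// return to zero, i.e. no leaks).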
+
+
+
+EIGEN_DECLARE_TEST(cxx11_maxsizevector)
+{
+ typedef MaxSizeVector<Foo> VectorX;
+ Foo::object_count = 0;
+ for(int r = 0; r < g_repeat; r++) {
+ Index rows = internal::random<Index>(3,30);
+ Foo::object_limit = internal::random<Index>(0, rows - 2);
+ std::cout << "object_limit = " << Foo::object_limit << std::endl;
+ bool exception_raised = false;
+#ifdef EIGEN_EXCEPTIONS
+ try
+ {
+#endif
+      std::cout << "\nVectorX vect(" << rows << ");\n";
+ VectorX vect(rows);
+ for(int i=0; i<rows; ++i)
+ vect.push_back(Foo());
+#ifdef EIGEN_EXCEPTIONS
+ VERIFY(false); // not reached if exceptions are enabled
+ }
+ catch (const Foo::Fail&) { exception_raised = true; }
+ VERIFY(exception_raised);
+#endif
+ VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+
+ {
+ Foo::object_limit = rows+1;
+ VectorX vect2(rows, Foo());
+ VERIFY_IS_EQUAL(Foo::object_count, rows);
+ }
+ VERIFY_IS_EQUAL(Index(0), Foo::object_count);
+ std::cout << '\n';
+ }
+}
diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp
index 8911c59d8..510e11032 100644
--- a/unsupported/test/cxx11_meta.cpp
+++ b/unsupported/test/cxx11_meta.cpp
@@ -340,7 +340,7 @@ static void test_array_misc()
VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
}
-void test_cxx11_meta()
+EIGEN_DECLARE_TEST(cxx11_meta)
{
CALL_SUBTEST(test_gen_numeric_list());
CALL_SUBTEST(test_concat());
diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 5f9bb938b..993ee1789 100644
--- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -11,22 +11,23 @@
#define EIGEN_USE_THREADS
#include "main.h"
#include "Eigen/CXX11/ThreadPool"
+#include "Eigen/CXX11/Tensor"
static void test_create_destroy_empty_pool()
{
// Just create and destroy the pool. This will wind up and tear down worker
// threads. Ensure there are no issues in that logic.
for (int i = 0; i < 16; ++i) {
- NonBlockingThreadPool tp(i);
+ ThreadPool tp(i);
}
}
-static void test_parallelism()
+static void test_parallelism(bool allow_spinning)
{
// Test we never-ever fail to match available tasks with idle threads.
const int kThreads = 16; // code below expects that this is a multiple of 4
- NonBlockingThreadPool tp(kThreads);
+ ThreadPool tp(kThreads, allow_spinning);
VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
for (int iter = 0; iter < 100; ++iter) {
@@ -100,8 +101,80 @@ static void test_parallelism()
}
}
-void test_cxx11_non_blocking_thread_pool()
+
+static void test_cancel()
+{
+ ThreadPool tp(2);
+
+  // Schedule a large number of closures that each sleep for two seconds. This
+  // will keep the thread pool busy for much longer than the default test timeout.
+ for (int i = 0; i < 1000; ++i) {
+ tp.Schedule([]() {
+ std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+ });
+ }
+
+ // Cancel the processing of all the closures that are still pending.
+ tp.Cancel();
+}
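+
+// (This test only relies on Cancel() dropping the closures that have not yet
+// started; whether already-running closures are interrupted depends on
+// platform support for thread cancellation.)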
+
+static void test_pool_partitions() {
+ const int kThreads = 2;
+ ThreadPool tp(kThreads);
+
+  // Assign each thread to its own partition, so that a thread only steals
+  // work from outside its partition when it is idle.
+ std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads);
+ for (int i = 0; i < kThreads; ++i) {
+ steal_partitions[i] = std::make_pair(i, i + 1);
+ }
+ tp.SetStealPartitions(steal_partitions);
+
+ std::atomic<int> running(0);
+ std::atomic<int> done(0);
+ std::atomic<int> phase(0);
+
+ // Schedule kThreads tasks and ensure that they all are running.
+ for (int i = 0; i < kThreads; ++i) {
+ tp.Schedule([&]() {
+ const int thread_id = tp.CurrentThreadId();
+ VERIFY_GE(thread_id, 0);
+ VERIFY_LE(thread_id, kThreads - 1);
+ ++running;
+ while (phase < 1) {
+ }
+ ++done;
+ });
+ }
+ while (running != kThreads) {
+ }
+ // Schedule each closure to only run on thread 'i' and verify that it does.
+ for (int i = 0; i < kThreads; ++i) {
+ tp.ScheduleWithHint(
+ [&, i]() {
+ ++running;
+ const int thread_id = tp.CurrentThreadId();
+ VERIFY_IS_EQUAL(thread_id, i);
+ while (phase < 2) {
+ }
+ ++done;
+ },
+ i, i + 1);
+ }
+ running = 0;
+ phase = 1;
+ while (running != kThreads) {
+ }
+ running = 0;
+ phase = 2;
+}
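+
+// (ScheduleWithHint(fn, start, limit) requests that fn run on a thread in
+// [start, limit); together with the one-thread-per-partition steal setup
+// above, each closure is pinned to a single thread, which the test checks
+// via CurrentThreadId().)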
+
+
+EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool)
{
CALL_SUBTEST(test_create_destroy_empty_pool());
- CALL_SUBTEST(test_parallelism());
+ CALL_SUBTEST(test_parallelism(true));
+ CALL_SUBTEST(test_parallelism(false));
+ CALL_SUBTEST(test_cancel());
+ CALL_SUBTEST(test_pool_partitions());
}
diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp
index 91f690114..8fc5a3074 100644
--- a/unsupported/test/cxx11_runqueue.cpp
+++ b/unsupported/test/cxx11_runqueue.cpp
@@ -227,7 +227,7 @@ void test_stress_runqueue()
VERIFY(total.load() == 0);
}
-void test_cxx11_runqueue()
+EIGEN_DECLARE_TEST(cxx11_runqueue)
{
CALL_SUBTEST_1(test_basic_runqueue());
CALL_SUBTEST_2(test_empty_runqueue());
diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp
index 037767270..4a0c8967b 100644
--- a/unsupported/test/cxx11_tensor_argmax.cpp
+++ b/unsupported/test/cxx11_tensor_argmax.cpp
@@ -273,7 +273,7 @@ static void test_argmin_dim()
}
}
-void test_cxx11_tensor_argmax()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax)
{
CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 653443dc5..79f4066e9 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,19 +9,18 @@
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
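+// TensorGpuHipCudaDefines.h maps the gpu-prefixed names used below (gpuMalloc,
+// gpuMemcpy, gpuMemcpyAsync, gpuStreamSynchronize, gpuFree, gpuSuccess, ...)
+// onto the CUDA or HIP equivalents, so the same test source builds for both
+// backends.
+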
using Eigen::Tensor;
template <int Layout>
-void test_cuda_simple_argmax()
+void test_gpu_simple_argmax()
{
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@@ -37,13 +36,13 @@ void test_cuda_simple_argmax()
double* d_in;
DenseIndex* d_out_max;
DenseIndex* d_out_min;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out_max), out_bytes);
- cudaMalloc((void**)(&d_out_min), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out_max), out_bytes);
+ gpuMalloc((void**)(&d_out_min), out_bytes);
- cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@@ -53,20 +52,20 @@ void test_cuda_simple_argmax()
gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin();
- assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
- cudaFree(d_in);
- cudaFree(d_out_max);
- cudaFree(d_out_min);
+ gpuFree(d_in);
+ gpuFree(d_out_max);
+ gpuFree(d_out_min);
}
template <int DataLayout>
-void test_cuda_argmax_dim()
+void test_gpu_argmax_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
@@ -100,12 +99,12 @@ void test_cuda_argmax_dim()
float* d_in;
DenseIndex* d_out;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -113,8 +112,8 @@ void test_cuda_argmax_dim()
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
size_t(2*3*5*7 / tensor.dimension(dim)));
@@ -137,25 +136,25 @@ void test_cuda_argmax_dim()
}
}
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
}
template <int DataLayout>
-void test_cuda_argmin_dim()
+void test_gpu_argmin_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
@@ -189,12 +188,12 @@ void test_cuda_argmin_dim()
float* d_in;
DenseIndex* d_out;
- cudaMalloc((void**)(&d_in), in_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in), in_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -202,8 +201,8 @@ void test_cuda_argmin_dim()
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
2*3*5*7 / tensor.dimension(dim));
@@ -226,29 +225,29 @@ void test_cuda_argmin_dim()
}
}
- cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
- assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
    // Expect min to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
}
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu)
{
- CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
- CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
- CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
- CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
- CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
- CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+ CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+ CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+ CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+ CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
new file mode 100644
index 000000000..7ac71286e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int Layout, typename DenseIndex>
+static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
+ Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+ Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
+ Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
+ in.setRandom();
+ in *= in.constant(100.0);
+ in(0, 0, 0) = -1000.0;
+ in(1, 1, 1) = 1000.0;
+
+ std::size_t in_bytes = in.size() * sizeof(DataType);
+ std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+ DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+ DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in,
+ Eigen::array<DenseIndex, 3>{{2, 2, 2}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max);
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min);
+ sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes);
+
+ gpu_out_max.device(sycl_device) = gpu_in.argmax();
+ gpu_out_min.device(sycl_device) = gpu_in.argmin();
+
+ sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes);
+ sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes);
+
+ VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1);
+ VERIFY_IS_EQUAL(out_min(), 0);
+
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out_max);
+ sycl_device.deallocate(d_out_min);
+}
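+
+// (In the test above, in(1, 1, 1) = 1000.0 puts the maximum at the last
+// linear position of the 2x2x2 tensor in either layout, hence
+// argmax == 2 * 2 * 2 - 1, and the minimum at in(0, 0, 0) gives argmin == 0.)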
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
+ DenseIndex sizeDim0 = 9;
+ DenseIndex sizeDim1 = 3;
+ DenseIndex sizeDim2 = 5;
+ DenseIndex sizeDim3 = 7;
+ Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+ std::vector<DenseIndex> dims;
+ dims.push_back(sizeDim0);
+ dims.push_back(sizeDim1);
+ dims.push_back(sizeDim2);
+ dims.push_back(sizeDim3);
+ for (DenseIndex dim = 0; dim < 4; ++dim) {
+ array<DenseIndex, 3> out_shape;
+ for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+ Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i;
+ ix[1] = j;
+ ix[2] = k;
+ ix[3] = l;
+          // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+ tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(DataType);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+ d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+ size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ sycl_device.synchronize();
+
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i;
+ ix[1] = j;
+ ix[2] = k;
+ ix[3] = l;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+ tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+ }
+ }
+ }
+ }
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmax(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+ // Expect max to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out);
+ }
+}
+
+template <typename DataType, int DataLayout, typename DenseIndex>
+static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
+ DenseIndex sizeDim0 = 9;
+ DenseIndex sizeDim1 = 3;
+ DenseIndex sizeDim2 = 5;
+ DenseIndex sizeDim3 = 7;
+ Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3);
+
+ std::vector<DenseIndex> dims;
+ dims.push_back(sizeDim0);
+ dims.push_back(sizeDim1);
+ dims.push_back(sizeDim2);
+ dims.push_back(sizeDim3);
+ for (DenseIndex dim = 0; dim < 4; ++dim) {
+ array<DenseIndex, 3> out_shape;
+ for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1];
+
+ Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape);
+
+ array<DenseIndex, 4> ix;
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i;
+ ix[1] = j;
+ ix[2] = k;
+ ix[3] = l;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+ tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+ }
+ }
+ }
+ }
+
+ std::size_t in_bytes = tensor.size() * sizeof(DataType);
+ std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+ DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes));
+ DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in(
+ d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}});
+ Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape);
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()),
+ size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim)));
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+    // Expect min to be in the first index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+ }
+
+ sycl_device.synchronize();
+
+ for (DenseIndex i = 0; i < sizeDim0; ++i) {
+ for (DenseIndex j = 0; j < sizeDim1; ++j) {
+ for (DenseIndex k = 0; k < sizeDim2; ++k) {
+ for (DenseIndex l = 0; l < sizeDim3; ++l) {
+ ix[0] = i;
+ ix[1] = j;
+ ix[2] = k;
+ ix[3] = l;
+ // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+ tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+ }
+ }
+ }
+ }
+
+ sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes);
+ gpu_out.device(sycl_device) = gpu_in.argmin(dim);
+ sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes);
+
+ for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+    // Expect min to be in the last index of the reduced dimension
+ VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+ }
+ sycl_device.deallocate(d_in);
+ sycl_device.deallocate(d_out);
+ }
+}
+
+template <typename DataType, typename Device_Selector>
+void sycl_argmax_test_per_device(const Device_Selector& d) {
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp
index 8fe85d83c..ce9d24369 100644
--- a/unsupported/test/cxx11_tensor_assign.cpp
+++ b/unsupported/test/cxx11_tensor_assign.cpp
@@ -358,7 +358,7 @@ static void test_std_initializers_tensor() {
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
}
-void test_cxx11_tensor_assign()
+EIGEN_DECLARE_TEST(cxx11_tensor_assign)
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp
new file mode 100644
index 000000000..5fb12e0e0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_access.cpp
@@ -0,0 +1,576 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Andy Davis <andydavis@google.com>
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <algorithm>
+#include <set>
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::Index;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TensorBlockShapeType;
+
+static TensorOpCost zeroCost() { return {0, 0, 0}; }
+
+template<typename T>
+static const T& choose(int layout, const T& col, const T& row) {
+ return layout == ColMajor ? col : row;
+}
+
+static TensorBlockShapeType RandomShape() {
+ return internal::random<bool>()
+ ? TensorBlockShapeType::kUniformAllDims
+ : TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) {
+ return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims() {
+ array<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<int>(1, 20);
+ }
+ return DSizes<Index, NumDims>(dims);
+}
+
+template <typename T>
+static T* GenerateRandomData(const Index& size) {
+ T* data = new T[size];
+ for (int i = 0; i < size; ++i) {
+ data[i] = internal::random<T>();
+ }
+ return data;
+}
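+// (Callers own the returned buffer and are responsible for delete[]-ing it.)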
+
+template <int NumDims>
+static void Debug(DSizes<Index, NumDims> dims) {
+ for (int i = 0; i < NumDims; ++i) {
+ std::cout << dims[i] << "; ";
+ }
+ std::cout << std::endl;
+}
+
+template <int Layout>
+static void test_block_mapper_sanity()
+{
+ typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+ DSizes<Index, 2> tensor_dims(100, 100);
+
+ // Test uniform blocks.
+ TensorBlockMapper uniform_block_mapper(
+ tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()});
+
+ VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100);
+ VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100);
+
+ // 10x10 blocks
+ auto uniform_b0 = uniform_block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10);
+ VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10);
+
+ // Test skewed to inner dims blocks.
+ TensorBlockMapper skewed_block_mapper(
+ tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()});
+
+ VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100);
+ VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100);
+
+  // 1x100 (100x1) rows/cols depending on the tensor layout.
+ auto skewed_b0 = skewed_block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1));
+ VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100));
+}
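+
+// (With a 100x100 tensor and a budget of 100 coefficients per block,
+// kUniformAllDims yields square 10x10 blocks, while kSkewedInnerDims gives
+// the whole inner dimension to the block, hence the 100x1 / 1x100 shapes
+// checked above.)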
+
+// Given a TensorBlock, "visit" every element accessible through it, and keep
+// an index of the visited elements in a set. Verify that every coeff is
+// accessed only once.
+template<int NumDims, int Layout>
+static void UpdateCoeffSet(
+ const DSizes<Index, NumDims>& tensor_strides,
+ const internal::TensorBlockDescriptor<NumDims>& block,
+ Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) {
+ const DSizes<Index, NumDims>& block_sizes = block.dimensions();
+
+ for (int i = 0; i < block_sizes[dim_index]; ++i) {
+ if (tensor_strides[dim_index] == 1) {
+ typedef std::pair<std::set<Index>::iterator, bool> ReturnType;
+ ReturnType inserted = visited_coeffs->insert(first_coeff_index + i);
+ VERIFY_IS_EQUAL(inserted.second, true);
+ } else {
+ int next_dim_index = dim_index + choose(Layout, -1, 1);
+ UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index,
+ next_dim_index, visited_coeffs);
+ first_coeff_index += tensor_strides[dim_index];
+ }
+ }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_mapper_maps_every_element() {
+ typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper;
+
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>();
+ DSizes<Index, NumDims> strides = internal::strides<Layout>(dims);
+
+ // Keep track of elements indices available via block access.
+ std::set<Index> coeff_set;
+
+ // Try different combinations of block types and sizes.
+ TensorBlockMapper block_mapper(
+ dims, {RandomShape(), RandomTargetSize(dims), zeroCost()});
+
+ for (int i = 0; i < block_mapper.blockCount(); ++i) {
+ auto block = block_mapper.blockDescriptor(i);
+ UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(),
+ choose(Layout, NumDims - 1, 0),
+ &coeff_set);
+ }
+
+  // Verify that every coefficient in the original Tensor is accessible
+  // through a TensorBlock exactly once.
+ Index total_coeffs = dims.TotalSize();
+ VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs);
+ VERIFY_IS_EQUAL(*coeff_set.begin(), 0);
+ VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1);
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+ const array<Index, NumDims>& output_to_input_dim_map,
+ const array<Index, NumDims>& input_strides,
+ const array<Index, NumDims>& output_strides) {
+ int input_index = 0;
+ if (Layout == ColMajor) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[0]];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+ }
+}
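+
+// (GetInputIndex converts a linear output index into the corresponding linear
+// input index of a tensor whose dimensions were permuted by
+// output_to_input_dim_map: each coordinate is peeled off via the output
+// strides and re-applied with the input stride of the mapped dimension.)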
+
+template <int Layout, int NumDims>
+static array<Index, NumDims> ComputeStrides(
+ const array<Index, NumDims>& sizes) {
+ array<Index, NumDims> strides;
+ if (Layout == ColMajor) {
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i - 1] * sizes[i - 1];
+ }
+ } else {
+ strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * sizes[i + 1];
+ }
+ }
+ return strides;
+}
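+
+// (Example: for sizes {2, 3, 4}, ColMajor strides are {1, 2, 6} (inner-most
+// dimension first) and RowMajor strides are {12, 4, 1} (inner-most last).)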
+
+template<typename Scalar, typename StorageIndex, int Dim>
+class EqualityChecker
+{
+ const Scalar* input_data;
+ const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides;
+ void check_recursive(const Scalar* input, const Scalar* output, int depth=0) const
+ {
+ if(depth==Dim)
+ {
+ VERIFY_IS_EQUAL(*input, *output);
+ return;
+ }
+
+ for(int i=0; i<output_dims[depth]; ++i)
+ {
+ check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1);
+ }
+ }
+public:
+ EqualityChecker(const Scalar* input_data_,
+ const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_,
+ const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_)
+ : input_data(input_data_)
+ , input_dims(input_dims_), input_strides(input_strides_)
+ , output_dims(output_dims_), output_strides(output_strides_)
+ {}
+
+ void operator()(const Scalar* output_data) const
+ {
+ check_recursive(input_data, output_data);
+ }
+};
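+
+// (EqualityChecker recursively compares an output buffer against the input
+// data, re-reading the input modulo input_dims along any dimension where the
+// output is larger; the "i % input_dims[depth]" term in check_recursive
+// implements this broadcast-aware comparison.)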
+
+template <int Layout>
+static void test_uniform_block_shape()
+{
+ typedef internal::TensorBlockDescriptor<5> TensorBlock;
+ typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+ {
+ // Test shape 'UniformAllDims' with uniform 'max_coeff count'.
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 5 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ for (int i = 0; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+ // partially into first inner-most dimension.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 7 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 5 * 5 * 5 * 5 * 6;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+ // fully into first inner-most dimension.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 5 * 5 * 5 * 5;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 5 * 5 * 5 * 5 * 7;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(5, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills
+ // fully into first few inner-most dimensions.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+ const Index max_coeff_count = 7 * 5 * 6 * 7 * 5;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(7, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[4]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+ const Index max_coeff_count = 5 * 5 * 5 * 6 * 7;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[0]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'UniformAllDims' with full allocation to all dims.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(7, 5, 6, 17, 7);
+ const Index max_coeff_count = 7 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(7, 5, 6, 9, 7);
+ const Index max_coeff_count = 7 * 5 * 6 * 9 * 7;
+ TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims,
+ max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY_IS_EQUAL(9, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(7, block.dimensions()[0]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+}
+
+template <int Layout>
+static void test_skewed_inner_dim_block_shape()
+{
+ typedef internal::TensorBlockDescriptor<5> TensorBlock;
+ typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper;
+
+ // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 10 * 1 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(10, block.dimensions()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 1 * 1 * 1 * 1 * 6;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(6, block.dimensions()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 1 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ for (int i = 1; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 1 * 1 * 1 * 1 * 7;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ for (int i = 3; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+ // and partial allocation to second inner-dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 3 * 1 * 1 * 1;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ VERIFY_IS_EQUAL(3, block.dimensions()[1]);
+ for (int i = 2; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 1 * 1 * 1 * 15 * 7;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY_IS_EQUAL(15, block.dimensions()[3]);
+ for (int i = 2; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to inner-most dim,
+ // and partial allocation to third inner-dim.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 5 * 5 * 1 * 1;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+ for (int i = 3; i < 5; ++i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 1 * 1 * 5 * 17 * 7;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[2]);
+ for (int i = 1; i >= 0; --i) {
+ VERIFY_IS_EQUAL(1, block.dimensions()[i]);
+ }
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+
+ // Test shape 'SkewedInnerDims' with full allocation to all dims.
+ if (Layout == ColMajor) {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ } else {
+ DSizes<Index, 5> dims(11, 5, 6, 17, 7);
+ const Index max_coeff_count = 11 * 5 * 6 * 17 * 7;
+ TensorBlockMapper block_mapper(
+ dims,
+ {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()});
+ TensorBlock block = block_mapper.blockDescriptor(0);
+ VERIFY_IS_EQUAL(7, block.dimensions()[4]);
+ VERIFY_IS_EQUAL(17, block.dimensions()[3]);
+ VERIFY_IS_EQUAL(6, block.dimensions()[2]);
+ VERIFY_IS_EQUAL(5, block.dimensions()[1]);
+ VERIFY_IS_EQUAL(11, block.dimensions()[0]);
+ VERIFY(block.dimensions().TotalSize() <= max_coeff_count);
+ }
+}
+
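+// Illustrative sketch (not part of the original test code): with ColMajor
+// dims (11, 5, 6, 17, 7) and a budget of max_coeff_count = 11 * 3 = 33, the
+// 'SkewedInnerDims' policy fills the innermost dimension completely (11),
+// gives the remainder to the next dimension (3), and leaves all outer
+// dimensions at 1:
+//
+//   TensorBlockMapper block_mapper(
+//       DSizes<Index, 5>(11, 5, 6, 17, 7),
+//       {TensorBlockShapeType::kSkewedInnerDims, 33, zeroCost()});
+//   // block_mapper.blockDescriptor(0).dimensions() == (11, 3, 1, 1, 1)
+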
+template <int Layout>
+static void test_empty_dims(const internal::TensorBlockShapeType block_shape)
+{
+ // Test blocking of tensors with zero dimensions:
+ // - we must not crash on asserts and divisions by zero
+  // - we must not return a block with zero dimensions
+  //   (a recipe for overflows/underflows, divisions by zero and NaNs later)
+ // - total block count must be zero
+ {
+ typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper;
+
+ DSizes<Index, 1> dims(0);
+ for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+ TensorBlockMapper block_mapper(
+ dims, {block_shape, max_coeff_count, zeroCost()});
+ VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+ VERIFY(block_mapper.blockTotalSize() >= 1);
+ }
+ }
+
+ {
+ typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper;
+
+ for (int dim1 = 0; dim1 < 3; ++dim1) {
+ for (int dim2 = 0; dim2 < 3; ++dim2) {
+ DSizes<Index, 2> dims(dim1, dim2);
+ for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) {
+ TensorBlockMapper block_mapper(
+ dims, {block_shape, max_coeff_count, zeroCost()});
+ if (dim1 * dim2 == 0) {
+ VERIFY_IS_EQUAL(block_mapper.blockCount(), 0);
+ }
+ VERIFY(block_mapper.blockTotalSize() >= 1);
+ }
+ }
+ }
+ }
+}
+
+#define TEST_LAYOUTS(NAME) \
+ CALL_SUBTEST(NAME<ColMajor>()); \
+ CALL_SUBTEST(NAME<RowMajor>())
+
+#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \
+ CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \
+ CALL_SUBTEST((NAME<TYPE, 5, RowMajor>()))
+
+#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \
+ CALL_SUBTEST(NAME<ColMajor>(ARG)); \
+ CALL_SUBTEST(NAME<RowMajor>(ARG))
+
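+// For instance, TEST_LAYOUTS_WITH_ARG(test_empty_dims, ARG) expands to one
+// subtest per layout:
+//   CALL_SUBTEST(test_empty_dims<ColMajor>(ARG));
+//   CALL_SUBTEST(test_empty_dims<RowMajor>(ARG));
+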
+EIGEN_DECLARE_TEST(cxx11_tensor_block_access) {
+ TEST_LAYOUTS(test_block_mapper_sanity);
+ TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element);
+ TEST_LAYOUTS(test_uniform_block_shape);
+ TEST_LAYOUTS(test_skewed_inner_dim_block_shape);
+ TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims);
+ TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kSkewedInnerDims);
+}
+
+#undef TEST_LAYOUTS
+#undef TEST_LAYOUTS_AND_DIMS
+#undef TEST_LAYOUTS_WITH_ARG
diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp
new file mode 100644
index 000000000..b2e26ebb7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_eval.cpp
@@ -0,0 +1,858 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+using Eigen::internal::TensorBlockDescriptor;
+using Eigen::internal::TensorExecutor;
+
+// -------------------------------------------------------------------------- //
+// Utility functions to generate random tensors, blocks, and evaluate them.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+ DSizes<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<Index>(min, max);
+ }
+  return dims;
+}
+
+// Block offsets and extents allow us to construct a TensorSlicingOp
+// corresponding to a TensorBlockDescriptor.
+template <int NumDims>
+struct TensorBlockParams {
+ DSizes<Index, NumDims> offsets;
+ DSizes<Index, NumDims> sizes;
+ TensorBlockDescriptor<NumDims, Index> desc;
+};
+
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims,
+ Index min, Index max) {
+ // Choose random offsets and sizes along all tensor dimensions.
+ DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max));
+ DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max));
+
+ // Make sure that offset + size do not overflow dims.
+ for (int i = 0; i < NumDims; ++i) {
+ offsets[i] = numext::mini(dims[i] - 1, offsets[i]);
+ sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]);
+ }
+
+ Index offset = 0;
+ DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims);
+ for (int i = 0; i < NumDims; ++i) {
+ offset += strides[i] * offsets[i];
+ }
+
+ return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)};
+}
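+
+// Usage sketch (hypothetical values): the generated params drive both code
+// paths compared below, e.g.
+//   DSizes<Index, 2> dims(10, 10);
+//   TensorBlockParams<2> params = RandomBlock<ColMajor>(dims, 1, 5);
+//   // params.desc          -> evaluated via TensorEvaluator::block()
+//   // params.offsets/sizes -> evaluated via expr.slice(offsets, sizes)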
+
+// Generate a block with block sizes skewed towards the inner dimensions. This
+// type of block is required for evaluating broadcast expressions.
+template <int Layout, int NumDims>
+static TensorBlockParams<NumDims> SkewedInnerBlock(
+ DSizes<Index, NumDims> dims) {
+ using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>;
+ BlockMapper block_mapper(dims,
+ {internal::TensorBlockShapeType::kSkewedInnerDims,
+ internal::random<size_t>(1, dims.TotalSize()),
+ {0, 0, 0}});
+
+ Index total_blocks = block_mapper.blockCount();
+ Index block_index = internal::random<Index>(0, total_blocks - 1);
+ auto block = block_mapper.blockDescriptor(block_index);
+ DSizes<Index, NumDims> sizes = block.dimensions();
+
+ auto strides = internal::strides<Layout>(dims);
+ DSizes<Index, NumDims> offsets;
+
+ // Compute offsets for the first block coefficient.
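+  // E.g. in ColMajor with dims (4, 3) the strides are (1, 4): a linear offset
+  // of 9 splits as 9 / 4 = 2 for the outer dimension, with remainder 1 for
+  // the inner one, i.e. coordinate (1, 2).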
+ Index index = block.offset();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / strides[i];
+ index -= idx * strides[i];
+ offsets[i] = idx;
+ }
+ if (NumDims > 0) offsets[0] = index;
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / strides[i];
+ index -= idx * strides[i];
+ offsets[i] = idx;
+ }
+ if (NumDims > 0) offsets[NumDims - 1] = index;
+ }
+
+ return {offsets, sizes, block};
+}
+
+template <int NumDims>
+static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) {
+ DSizes<Index, NumDims> offsets;
+ for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+
+ return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)};
+}
+
+inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) {
+ Eigen::IndexList<Index, Eigen::type2index<1>> ret;
+ ret.set(0, n);
+ return ret;
+}
+inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) {
+ Eigen::IndexList<Eigen::type2index<1>, Index> ret;
+ ret.set(1, m);
+ return ret;
+}
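+
+// Usage sketch: NByOne(n) is the IndexList equivalent of the dynamic shape
+// (n, 1) and OneByM(m) of (1, m), so a (1, n) tensor can be expanded into an
+// (n, n) expression (as done in the reshape-with-broadcast tests below):
+//   Tensor<float, 2> t(1, n);
+//   auto expr = t.reshape(NByOne(n)).broadcast(OneByM(n));  // shape (n, n)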
+
+// -------------------------------------------------------------------------- //
+// Verify that block expression evaluation produces the same result as a
+// TensorSliceOp (reading a tensor block is the same as taking a tensor slice).
+
+template <typename T, int NumDims, int Layout, typename Expression,
+ typename GenBlockParams>
+static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) {
+ using Device = DefaultDevice;
+ auto d = Device();
+
+ // Scratch memory allocator for block evaluation.
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+ TensorBlockScratch scratch(d);
+
+ // TensorEvaluator is needed to produce tensor blocks of the expression.
+ auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d);
+ eval.evalSubExprsIfNeeded(nullptr);
+
+  // Choose random offsets, sizes and a TensorBlockDescriptor.
+ TensorBlockParams<NumDims> block_params = gen_block();
+
+ // Evaluate TensorBlock expression into a tensor.
+ Tensor<T, NumDims, Layout> block(block_params.desc.dimensions());
+
+ // Dimensions for the potential destination buffer.
+ DSizes<Index, NumDims> dst_dims;
+ if (internal::random<bool>()) {
+ dst_dims = block_params.desc.dimensions();
+ } else {
+ for (int i = 0; i < NumDims; ++i) {
+ Index extent = internal::random<Index>(0, 5);
+ dst_dims[i] = block_params.desc.dimension(i) + extent;
+ }
+ }
+
+ // Maybe use this tensor as a block desc destination.
+ Tensor<T, NumDims, Layout> dst(dst_dims);
+ dst.setZero();
+ if (internal::random<bool>()) {
+ block_params.desc.template AddDestinationBuffer<Layout>(
+ dst.data(), internal::strides<Layout>(dst.dimensions()));
+ }
+
+ const bool root_of_expr = internal::random<bool>();
+ auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr);
+
+ if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) {
+ // Copy data from destination buffer.
+ if (dimensions_match(dst.dimensions(), block.dimensions())) {
+ block = dst;
+ } else {
+ DSizes<Index, NumDims> offsets;
+ for (int i = 0; i < NumDims; ++i) offsets[i] = 0;
+ block = dst.slice(offsets, block.dimensions());
+ }
+
+ } else {
+ // Assign to block from expression.
+ auto b_expr = tensor_block.expr();
+
+    // We explicitly disable vectorization and tiling to run a plain
+    // coefficient-wise assignment loop, which is simple enough to be
+    // trivially correct.
+ using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>;
+ using BlockExecutor = TensorExecutor<const BlockAssign, Device, false,
+ internal::TiledEvaluation::Off>;
+ BlockExecutor::run(BlockAssign(block, b_expr), d);
+ }
+
+ // Cleanup temporary buffers owned by a tensor block.
+ tensor_block.cleanup();
+
+ // Compute a Tensor slice corresponding to a Tensor block.
+ Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions());
+ auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+ // Explicitly use coefficient assignment to evaluate slice expression.
+ using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>;
+ using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+ internal::TiledEvaluation::Off>;
+ SliceExecutor::run(SliceAssign(slice, s_expr), d);
+
+ // Tensor block and tensor slice must be the same.
+ for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
+ }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_block() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // Identity tensor expression transformation.
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_unary_expr_block() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_expr_block() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+ lhs.setRandom();
+ rhs.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_binary_with_unary_expr_block() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> lhs(dims), rhs(dims);
+ lhs.setRandom();
+ rhs.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ (lhs.square() + rhs.square()).sqrt(),
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_broadcast() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5);
+
+ DSizes<Index, NumDims> bcasted_dims;
+ for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i];
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.broadcast(bcast),
+ [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.broadcast(bcast),
+ [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.broadcast(bcast),
+ [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); });
+
+ // Check that desc.destination() memory is not shared between two broadcast
+ // materializations.
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.broadcast(bcast) * input.abs().broadcast(bcast),
+ [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reshape() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
+
+ DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[0] + NumDims, std::mt19937(g_seed));
+
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.reshape(shuffled),
+ [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.reshape(shuffled),
+ [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_cast() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.template cast<int>().template cast<T>(),
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_select() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> lhs(dims);
+ Tensor<T, NumDims, Layout> rhs(dims);
+ Tensor<bool, NumDims, Layout> cond(dims);
+ lhs.setRandom();
+ rhs.setRandom();
+ cond.setRandom();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() {
+ return RandomBlock<Layout>(dims, 1, 20);
+ });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_padding() {
+ const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4);
+ DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4);
+ array<std::pair<Index, Index>, NumDims> paddings;
+ for (int i = 0; i < NumDims; ++i) {
+ paddings[i] = std::make_pair(pad_before[i], pad_after[i]);
+ }
+
+ // Test squeezing reads from inner dim.
+ if (internal::random<bool>()) {
+ pad_before[inner_dim] = 0;
+ pad_after[inner_dim] = 0;
+ paddings[inner_dim] = std::make_pair(0, 0);
+ }
+
+ DSizes<Index, NumDims> padded_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ padded_dims[i] = dims[i] + pad_before[i] + pad_after[i];
+ }
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return FixedSizeBlock(padded_dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.pad(paddings),
+ [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_chipping() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ Index chip_dim = internal::random<int>(0, NumDims - 1);
+ Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+ DSizes<Index, NumDims - 1> chipped_dims;
+ for (Index i = 0; i < chip_dim; ++i) {
+ chipped_dims[i] = dims[i];
+ }
+ for (Index i = chip_dim + 1; i < NumDims; ++i) {
+ chipped_dims[i - 1] = dims[i];
+ }
+
+ // Block buffer forwarding.
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+ // Block expression assignment.
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.abs().chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>(
+ input.abs().chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+}
+
+
+template<typename T, int NumDims>
+struct SimpleTensorGenerator {
+ T operator()(const array<Index, NumDims>& coords) const {
+ T result = static_cast<T>(0);
+ for (int i = 0; i < NumDims; ++i) {
+ result += static_cast<T>((i + 1) * coords[i]);
+ }
+ return result;
+ }
+};
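+
+// For example, with NumDims == 2 the generator maps coordinates (1, 2) to
+// 1 * 1 + 2 * 2 = 5: each coordinate is weighted by its dimension index + 1.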
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template<int NumDims>
+struct SimpleTensorGenerator<bool, NumDims> {
+ bool operator()(const array<Index, NumDims>& coords) const {
+ bool result = false;
+ for (int i = 0; i < NumDims; ++i) {
+ result ^= coords[i];
+ }
+ return result;
+ }
+};
+
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_generator() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ auto generator = SimpleTensorGenerator<T, NumDims>();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.generate(generator), [&dims]() { return FixedSizeBlock(dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.generate(generator),
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_reverse() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // Randomly reverse dimensions.
+ Eigen::DSizes<bool, NumDims> reverse;
+ for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() {
+ return RandomBlock<Layout>(dims, 1, 10);
+ });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_slice() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // Pick a random slice of an input tensor.
+ DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+ DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.slice(slice_start, slice_size),
+ [&slice_size]() { return FixedSizeBlock(slice_size); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.slice(slice_start, slice_size),
+ [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_shuffle() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ DSizes<Index, NumDims> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+ do {
+ DSizes<Index, NumDims> shuffled_dims;
+ for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.shuffle(shuffle),
+ [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(
+ input.shuffle(shuffle), [&shuffled_dims]() {
+ return RandomBlock<Layout>(shuffled_dims, 1, 5);
+ });
+
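+    // NOTE: only the initial (identity) permutation is exercised; the break
+    // below keeps this test cheap instead of iterating all NumDims! shuffles.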
+ break;
+
+ } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_reshape_with_bcast() {
+ Index dim = internal::random<Index>(1, 100);
+
+ Tensor<T, 2, Layout> lhs(1, dim);
+ Tensor<T, 2, Layout> rhs(dim, 1);
+ lhs.setRandom();
+ rhs.setRandom();
+
+ auto reshapeLhs = NByOne(dim);
+ auto reshapeRhs = OneByM(dim);
+
+ auto bcastLhs = OneByM(dim);
+ auto bcastRhs = NByOne(dim);
+
+ DSizes<Index, 2> dims(dim, dim);
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ lhs.reshape(reshapeLhs).broadcast(bcastLhs) *
+ rhs.reshape(reshapeRhs).broadcast(bcastRhs),
+ [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_forced_eval() {
+ Index dim = internal::random<Index>(1, 100);
+
+ Tensor<T, 2, Layout> lhs(dim, 1);
+ Tensor<T, 2, Layout> rhs(1, dim);
+ lhs.setRandom();
+ rhs.setRandom();
+
+ auto bcastLhs = OneByM(dim);
+ auto bcastRhs = NByOne(dim);
+
+ DSizes<Index, 2> dims(dim, dim);
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+ [dims]() { return SkewedInnerBlock<Layout, 2>(dims); });
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims),
+ [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); });
+}
+
+template <typename T, int Layout>
+static void test_eval_tensor_chipping_of_bcast() {
+ if (Layout != static_cast<int>(RowMajor)) return;
+
+ Index dim0 = internal::random<Index>(1, 10);
+ Index dim1 = internal::random<Index>(1, 10);
+ Index dim2 = internal::random<Index>(1, 10);
+
+ Tensor<T, 3, Layout> input(1, dim1, dim2);
+ input.setRandom();
+
+ Eigen::array<Index, 3> bcast = {{dim0, 1, 1}};
+ DSizes<Index, 2> chipped_dims(dim0, dim2);
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ input.broadcast(bcast).chip(0, 1),
+ [chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ input.broadcast(bcast).chip(0, 1),
+ [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); });
+
+ VerifyBlockEvaluator<T, 2, Layout>(
+ input.broadcast(bcast).chip(0, 1),
+ [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); });
+}
+
+// -------------------------------------------------------------------------- //
+// Verify that assigning block to a Tensor expression produces the same result
+// as an assignment to TensorSliceOp (writing a block is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+ typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+ Expression expr, GenBlockParams gen_block) {
+ using Device = DefaultDevice;
+ auto d = Device();
+
+  // We use a tensor evaluator as the target for block and slice assignments.
+ auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits the full expression.
+ TensorBlockParams<NumExprDims> block_params = gen_block();
+
+ // Generate random data of the selected block size.
+ Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+ block.setRandom();
+
+ // ************************************************************************ //
+ // (1) Assignment from a block.
+
+  // Construct a materialized block from the randomly generated block tensor.
+ internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+ internal::TensorBlockKind::kView, block.data(), block.dimensions());
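+  // (TensorBlockKind::kView marks the block as a non-owning view over
+  // block.data(); no data is copied here.)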
+
+ // Reset all underlying tensor values to zero.
+ tensor.setZero();
+
+ // Use evaluator to write block into a tensor.
+ eval.writeBlock(block_params.desc, blk);
+
+ // Make a copy of the result after assignment.
+ Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+ // ************************************************************************ //
+ // (2) Assignment to a slice
+
+ // Reset all underlying tensor values to zero.
+ tensor.setZero();
+
+  // Assign the block to a slice of the original expression.
+ auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+ // Explicitly use coefficient assignment to evaluate slice expression.
+ using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+ using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+ internal::TiledEvaluation::Off>;
+ SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+ // Make a copy of the result after assignment.
+ Tensor<T, NumDims, Layout> slice_assigned = tensor;
+
+ for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i));
+ }
+}
+
+// -------------------------------------------------------------------------- //
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); });
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map, [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_reshape() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ DSizes<Index, NumDims> shuffled = dims;
+  std::shuffle(&shuffled[0], &shuffled[0] + NumDims, std::mt19937(g_seed));
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.reshape(shuffled),
+ [&shuffled]() { return FixedSizeBlock(shuffled); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_chipping() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ Index chip_dim = internal::random<int>(0, NumDims - 1);
+ Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2);
+
+ DSizes<Index, NumDims - 1> chipped_dims;
+ for (Index i = 0; i < chip_dim; ++i) {
+ chipped_dims[i] = dims[i];
+ }
+ for (Index i = chip_dim + 1; i < NumDims; ++i) {
+ chipped_dims[i - 1] = dims[i];
+ }
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); });
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); });
+
+ VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>(
+ tensor, map.chip(chip_offset, chip_dim),
+ [&chipped_dims]() { return FixedSizeBlock(chipped_dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_slice() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ // Pick a random slice of tensor.
+ DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10);
+ DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10);
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.slice(slice_start, slice_size),
+ [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.slice(slice_start, slice_size),
+ [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.slice(slice_start, slice_size),
+ [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+ Tensor<T, NumDims, Layout> tensor(dims);
+
+ DSizes<Index, NumDims> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+ TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+ do {
+ DSizes<Index, NumDims> shuffled_dims;
+ for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.shuffle(shuffle),
+ [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+ VerifyBlockAssignment<T, NumDims, Layout>(
+ tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+ return RandomBlock<Layout>(shuffled_dims, 1, 5);
+ });
+
+ } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+ CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME) \
+ CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \
+ CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME) \
+ CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+ // clang-format off
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+ CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+ CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+ CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+ // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 000000000..52f7dde9b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
+
+template <int NumDims>
+static DSizes<Index, NumDims> RandomDims(Index min, Index max) {
+ DSizes<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<Index>(min, max);
+ }
+  return dims;
+}
+
+static internal::TensorBlockShapeType RandomBlockShape() {
+ return internal::random<bool>()
+ ? internal::TensorBlockShapeType::kUniformAllDims
+ : internal::TensorBlockShapeType::kSkewedInnerDims;
+}
+
+template <int NumDims>
+static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) {
+ return internal::random<size_t>(1, dims.TotalSize());
+}
+
+template <int Layout, int NumDims>
+static Index GetInputIndex(Index output_index,
+ const array<Index, NumDims>& output_to_input_dim_map,
+ const array<Index, NumDims>& input_strides,
+ const array<Index, NumDims>& output_strides) {
+  Index input_index = 0;
+ if (Layout == ColMajor) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[0]];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = output_index / output_strides[i];
+ input_index += idx * input_strides[output_to_input_dim_map[i]];
+ output_index -= idx * output_strides[i];
+ }
+ return input_index +
+ output_index * input_strides[output_to_input_dim_map[NumDims - 1]];
+ }
+}
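+
+// Worked example (ColMajor, NumDims == 2): an input of dims (3, 4) shuffled
+// into an output of dims (4, 3) has output_to_input_dim_map == {1, 0},
+// output strides (1, 4) and input strides (1, 3). Output index 6 is output
+// coordinate (2, 1), which maps to input coordinate (1, 2), so GetInputIndex
+// returns 1 * 1 + 2 * 3 = 7.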
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_data_from_source_to_target() {
+ using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ // Generate a random input Tensor.
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // Write data to an output Tensor.
+ Tensor<T, NumDims, Layout> output(dims);
+
+ // Construct a tensor block mapper.
+ using TensorBlockMapper =
+ internal::TensorBlockMapper<NumDims, Layout, Index>;
+ TensorBlockMapper block_mapper(
+ dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}});
+
+ // We will copy data from input to output through this buffer.
+ Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+ // Precompute strides for TensorBlockIO::Copy.
+ auto input_strides = internal::strides<Layout>(dims);
+ auto output_strides = internal::strides<Layout>(dims);
+
+ const T* input_data = input.data();
+ T* output_data = output.data();
+ T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+ auto desc = block_mapper.blockDescriptor(i);
+
+ auto blk_dims = desc.dimensions();
+ auto blk_strides = internal::strides<Layout>(blk_dims);
+
+ {
+ // Read from input into a block buffer.
+ IODst dst(blk_dims, blk_strides, block_data, 0);
+ IOSrc src(input_strides, input_data, desc.offset());
+
+ TensorBlockIO::Copy(dst, src);
+ }
+
+ {
+ // Write from block buffer to output.
+ IODst dst(blk_dims, output_strides, output_data, desc.offset());
+ IOSrc src(blk_strides, block_data, 0);
+
+ TensorBlockIO::Copy(dst, src);
+ }
+ }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+ }
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_block_io_copy_using_reordered_dimensions() {
+ // Generate a random input Tensor.
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // Create a random dimension re-ordering/shuffle.
+ std::vector<int> shuffle;
+
+ for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+ DSizes<Index, NumDims> output_tensor_dims;
+ DSizes<Index, NumDims> input_to_output_dim_map;
+ DSizes<Index, NumDims> output_to_input_dim_map;
+ for (Index i = 0; i < NumDims; ++i) {
+ output_tensor_dims[shuffle[i]] = dims[i];
+ input_to_output_dim_map[i] = shuffle[i];
+ output_to_input_dim_map[shuffle[i]] = i;
+ }
+
+ // Write data to an output Tensor.
+ Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+ // Construct a tensor block mapper.
+ // NOTE: Tensor block mapper works with shuffled dimensions.
+ using TensorBlockMapper =
+ internal::TensorBlockMapper<NumDims, Layout, Index>;
+ TensorBlockMapper block_mapper(output_tensor_dims,
+ {RandomBlockShape(),
+ RandomTargetBlockSize(output_tensor_dims),
+ {0, 0, 0}});
+
+ // We will copy data from input to output through this buffer.
+ Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+ // Precompute strides for TensorBlockIO::Copy.
+ auto input_strides = internal::strides<Layout>(dims);
+ auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+ const T* input_data = input.data();
+ T* output_data = output.data();
+ T* block_data = block.data();
+
+ for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+ auto desc = block_mapper.blockDescriptor(i);
+
+ const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+ desc.offset(), output_to_input_dim_map, input_strides,
+ output_strides);
+
+ // NOTE: Block dimensions are in the same order as output dimensions.
+
+ using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ auto blk_dims = desc.dimensions();
+ auto blk_strides = internal::strides<Layout>(blk_dims);
+
+ {
+ // Read from input into a block buffer.
+ IODst dst(blk_dims, blk_strides, block_data, 0);
+ IOSrc src(input_strides, input_data, first_coeff_index);
+
+ // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+ DSizes<int, NumDims> dim_map;
+ for (int j = 0; j < NumDims; ++j)
+ dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+ TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+ }
+
+ {
+ // We need to convert block dimensions from output to input order.
+ auto dst_dims = blk_dims;
+ for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+ dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+ }
+
+ // Write from block buffer to output.
+ IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+ IOSrc src(blk_strides, block_data, 0);
+
+ // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+ DSizes<int, NumDims> dim_map;
+ for (int j = 0; j < NumDims; ++j)
+ dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+ TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+ }
+ }
+
+ for (Index i = 0; i < dims.TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+ }
+}
+
+// This is the special case for reading data with reordering, when the
+// dimensions before/after reordering are the same. Squeezing reads along the
+// inner dimensions is illegal here, because we reorder the innermost dimension.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() {
+ DSizes<Index, 3> tensor_dims(7, 9, 7);
+ DSizes<Index, 3> block_dims = tensor_dims;
+
+ DSizes<int, 3> block_to_tensor_dim;
+ block_to_tensor_dim[0] = 2;
+ block_to_tensor_dim[1] = 1;
+ block_to_tensor_dim[2] = 0;
+
+ auto tensor_strides = internal::strides<Layout>(tensor_dims);
+ auto block_strides = internal::strides<Layout>(block_dims);
+
+ Tensor<float, 3, Layout> block(block_dims);
+ Tensor<float, 3, Layout> tensor(tensor_dims);
+ tensor.setRandom();
+
+ float* tensor_data = tensor.data();
+ float* block_data = block.data();
+
+ using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ // Read from a tensor into a block.
+ IODst dst(block_dims, block_strides, block_data, 0);
+ IOSrc src(tensor_strides, tensor_data, 0);
+
+ TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+ TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims);
+ TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+ for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+ for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+ for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+ float block_value = block_tensor(d2, d1, d0);
+ float tensor_value = tensor_tensor(d0, d1, d2);
+ VERIFY_IS_EQUAL(block_value, tensor_value);
+ }
+ }
+ }
+}
+
+// This is the special case for reading data with reordering, when the
+// dimensions before/after reordering are the same. Squeezing reads is allowed
+// here, because we only reorder the outer dimensions.
+template <int Layout>
+static void test_block_io_copy_using_reordered_dimensions_squeeze() {
+ DSizes<Index, 4> tensor_dims(7, 5, 9, 9);
+ DSizes<Index, 4> block_dims = tensor_dims;
+
+ DSizes<int, 4> block_to_tensor_dim;
+ block_to_tensor_dim[0] = 0;
+ block_to_tensor_dim[1] = 1;
+ block_to_tensor_dim[2] = 3;
+ block_to_tensor_dim[3] = 2;
+
+ auto tensor_strides = internal::strides<Layout>(tensor_dims);
+ auto block_strides = internal::strides<Layout>(block_dims);
+
+ Tensor<float, 4, Layout> block(block_dims);
+ Tensor<float, 4, Layout> tensor(tensor_dims);
+ tensor.setRandom();
+
+ float* tensor_data = tensor.data();
+ float* block_data = block.data();
+
+ using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ // Read from a tensor into a block.
+ IODst dst(block_dims, block_strides, block_data, 0);
+ IOSrc src(tensor_strides, tensor_data, 0);
+
+ TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim);
+
+ TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims);
+ TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims);
+
+ for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) {
+ for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) {
+ for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) {
+ for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) {
+ float block_value = block_tensor(d0, d1, d3, d2);
+ float tensor_value = tensor_tensor(d0, d1, d2, d3);
+ VERIFY_IS_EQUAL(block_value, tensor_value);
+ }
+ }
+ }
+ }
+}
+
+template <int Layout>
+static void test_block_io_zero_stride() {
+ DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30);
+
+ DSizes<Index, 5> input_tensor_dims = rnd_dims;
+ input_tensor_dims[0] = 1;
+ input_tensor_dims[2] = 1;
+ input_tensor_dims[4] = 1;
+
+ Tensor<float, 5, Layout> input(input_tensor_dims);
+ input.setRandom();
+
+ DSizes<Index, 5> output_tensor_dims = rnd_dims;
+
+ auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims);
+ auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims);
+
+ auto input_tensor_strides_with_zeros = input_tensor_strides;
+ input_tensor_strides_with_zeros[0] = 0;
+ input_tensor_strides_with_zeros[2] = 0;
+ input_tensor_strides_with_zeros[4] = 0;
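+  // A zero stride pins the read index along that dimension to 0, so every
+  // output coordinate in dims 0, 2 and 4 re-reads the same input element,
+  // which is exactly how broadcasting over size-1 dimensions works.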
+
+ Tensor<float, 5, Layout> output(output_tensor_dims);
+ output.setRandom();
+
+ using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ // Write data from input to output with broadcasting in dims [0, 2, 4].
+ IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0);
+ IOSrc src(input_tensor_strides_with_zeros, input.data(), 0);
+ TensorBlockIO::Copy(dst, src);
+
+ for (int i = 0; i < output_tensor_dims[0]; ++i) {
+ for (int j = 0; j < output_tensor_dims[1]; ++j) {
+ for (int k = 0; k < output_tensor_dims[2]; ++k) {
+ for (int l = 0; l < output_tensor_dims[3]; ++l) {
+ for (int m = 0; m < output_tensor_dims[4]; ++m) {
+ float input_value = input(0, j, 0, l, 0);
+ float output_value = output(i, j, k, l, m);
+ VERIFY_IS_EQUAL(input_value, output_value);
+ }
+ }
+ }
+ }
+ }
+}
+
+template <int Layout>
+static void test_block_io_squeeze_ones() {
+ using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>;
+ using IODst = typename TensorBlockIO::Dst;
+ using IOSrc = typename TensorBlockIO::Src;
+
+ // Total size > 1.
+ {
+ DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1);
+ auto strides = internal::strides<Layout>(block_sizes);
+
+ // Create a random input tensor.
+ Tensor<float, 5> input(block_sizes);
+ input.setRandom();
+
+ Tensor<float, 5> output(block_sizes);
+
+ IODst dst(block_sizes, strides, output.data(), 0);
+ IOSrc src(strides, input.data());
+ TensorBlockIO::Copy(dst, src);
+
+ for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+ }
+ }
+
+ // Total size == 1.
+ {
+ DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1);
+ auto strides = internal::strides<Layout>(block_sizes);
+
+ // Create a random input tensor.
+ Tensor<float, 5> input(block_sizes);
+ input.setRandom();
+
+ Tensor<float, 5> output(block_sizes);
+
+ IODst dst(block_sizes, strides, output.data(), 0);
+ IOSrc src(strides, input.data());
+ TensorBlockIO::Copy(dst, src);
+
+ for (Index i = 0; i < block_sizes.TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(output.data()[i], input.data()[i]);
+ }
+ }
+}
+
+#define CALL_SUBTESTS(NAME) \
+ CALL_SUBTEST((NAME<float, 1, RowMajor>())); \
+ CALL_SUBTEST((NAME<float, 2, RowMajor>())); \
+ CALL_SUBTEST((NAME<float, 4, RowMajor>())); \
+ CALL_SUBTEST((NAME<float, 5, RowMajor>())); \
+ CALL_SUBTEST((NAME<float, 1, ColMajor>())); \
+ CALL_SUBTEST((NAME<float, 2, ColMajor>())); \
+ CALL_SUBTEST((NAME<float, 4, ColMajor>())); \
+ CALL_SUBTEST((NAME<float, 5, ColMajor>())); \
+ CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \
+ CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \
+ CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \
+ CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \
+ CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \
+ CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \
+ CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \
+ CALL_SUBTEST((NAME<bool, 5, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_io) {
+ // clang-format off
+ CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target);
+ CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions);
+
+ CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>());
+ CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>());
+
+ CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>());
+ CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>());
+
+ CALL_SUBTEST(test_block_io_zero_stride<RowMajor>());
+ CALL_SUBTEST(test_block_io_zero_stride<ColMajor>());
+
+ CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>());
+ CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>());
+ // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 7201bfe37..20f84b8e0 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -13,8 +13,8 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -25,50 +25,120 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){
// BROADCAST test:
- array<int, 4> in_range = {{2, 3, 5, 7}};
- array<int, 4> broadcasts = {{2, 3, 1, 4}};
- array<int, 4> out_range; // = in_range * broadcasts
+  IndexType inDim1 = 2;
+  IndexType inDim2 = 3;
+  IndexType inDim3 = 5;
+  IndexType inDim4 = 7;
+  IndexType bDim1 = 2;
+  IndexType bDim2 = 3;
+  IndexType bDim3 = 1;
+  IndexType bDim4 = 4;
+ array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}};
+ array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+ array<IndexType, 4> out_range; // = in_range * broadcasts
for (size_t i = 0; i < out_range.size(); ++i)
out_range[i] = in_range[i] * broadcasts[i];
- Tensor<float, 4> input(in_range);
- Tensor<float, 4> out(out_range);
+ Tensor<DataType, 4, DataLayout, IndexType> input(in_range);
+ Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
for (size_t i = 0; i < in_range.size(); ++i)
VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
- for (int i = 0; i < input.size(); ++i)
- input(i) = static_cast<float>(i);
+ for (IndexType i = 0; i < input.size(); ++i)
+ input(i) = static_cast<DataType>(i);
- float * gpu_in_data = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
- float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+ DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
- TensorMap<Tensor<float, 4>> gpu_in(gpu_in_data, in_range);
- TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
- sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
+ TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+ sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < 9; ++j) {
- for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 28; ++l) {
+ for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+ for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+ for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+ for (IndexType l = 0; l < inDim4*bDim4; ++l) {
VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
}
}
}
}
+ printf("Broadcast Test with fixed size Passed\n");
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
+
+ // BROADCAST test:
+  IndexType inDim1 = 2;
+  IndexType inDim2 = 3;
+  IndexType inDim3 = 5;
+  IndexType inDim4 = 7;
+  IndexType bDim1 = 2;
+  IndexType bDim2 = 3;
+  IndexType bDim3 = 1;
+  IndexType bDim4 = 4;
+ array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}};
+ array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
+ array<IndexType, 4> out_range; // = in_range * broadcasts
+ for (size_t i = 0; i < out_range.size(); ++i)
+ out_range[i] = in_range[i] * broadcasts[i];
+
+ Tensor<DataType, 4, DataLayout, IndexType> input(in_range);
+ Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+
+ for (size_t i = 0; i < in_range.size(); ++i)
+ VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+ for (IndexType i = 0; i < input.size(); ++i)
+ input(i) = static_cast<DataType>(i);
+
+ DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
+ sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+ gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+ for (IndexType i = 0; i < inDim1*bDim1; ++i) {
+ for (IndexType j = 0; j < inDim2*bDim2; ++j) {
+ for (IndexType k = 0; k < inDim3*bDim3; ++k) {
+ for (IndexType l = 0; l < inDim4*bDim4; ++l) {
+ VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l));
+ }
+ }
+ }
+ }
printf("Broadcast Test Passed\n");
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-void test_cxx11_tensor_broadcast_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice sycl_device(s);
- CALL_SUBTEST(test_broadcast_sycl(sycl_device));
+template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){
+ std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
+ test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
+ }
}
diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 5c0ea5889..d3dab891f 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -91,7 +91,16 @@ static void test_vectorized_broadcasting()
}
}
+#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
+#else
+ array<Index, 3> new_dims;
+ new_dims[0] = 11;
+ new_dims[1] = 3;
+ new_dims[2] = 5;
+ tensor.resize(new_dims);
+#endif
+
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
@@ -115,7 +124,7 @@ static void test_static_broadcasting()
Tensor<float, 3, DataLayout> tensor(8,3,5);
tensor.setRandom();
-#if EIGEN_HAS_CONSTEXPR
+#if defined(EIGEN_HAS_INDEX_LIST)
Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
#else
Eigen::array<int, 3> broadcasts;
@@ -139,7 +148,16 @@ static void test_static_broadcasting()
}
}
+#if EIGEN_HAS_VARIADIC_TEMPLATES
tensor.resize(11,3,5);
+#else
+ array<Index, 3> new_dims;
+ new_dims[0] = 11;
+ new_dims[1] = 3;
+ new_dims[2] = 5;
+ tensor.resize(new_dims);
+#endif
+
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
@@ -180,8 +198,119 @@ static void test_fixed_size_broadcasting()
#endif
}
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+ Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 9;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 1;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+ for (int i = 0; i < 9; ++i) {
+ for (int j = 0; j < 13; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+ Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 1;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 19;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 19; ++l) {
+ VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_1d()
+{
+ Tensor<float, 3, DataLayout> tensor(1,7,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 3> broadcasts;
+ broadcasts[0] = 5;
+ broadcasts[1] = 1;
+ broadcasts[2] = 13;
+ Tensor<float, 3, DataLayout> broadcasted;
+ broadcasted = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcasted.dimension(0), 5);
+ VERIFY_IS_EQUAL(broadcasted.dimension(1), 7);
+ VERIFY_IS_EQUAL(broadcasted.dimension(2), 13);
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ for (int k = 0; k < 13; ++k) {
+ VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k));
+ }
+ }
+ }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n_by_one_2d()
+{
+ Tensor<float, 4, DataLayout> tensor(1,7,13,1);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> broadcasts;
+ broadcasts[0] = 5;
+ broadcasts[1] = 1;
+ broadcasts[2] = 1;
+ broadcasts[3] = 19;
+ Tensor<float, 4, DataLayout> broadcast;
+ broadcast = tensor.broadcast(broadcasts);
+
+ VERIFY_IS_EQUAL(broadcast.dimension(0), 5);
+ VERIFY_IS_EQUAL(broadcast.dimension(1), 7);
+ VERIFY_IS_EQUAL(broadcast.dimension(2), 13);
+ VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ for (int k = 0; k < 13; ++k) {
+ for (int l = 0; l < 19; ++l) {
+ VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l));
+ }
+ }
+ }
+ }
+}
-void test_cxx11_tensor_broadcasting()
+EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting)
{
CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
@@ -191,4 +320,12 @@ void test_cxx11_tensor_broadcasting()
CALL_SUBTEST(test_static_broadcasting<RowMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>());
+ CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>());
}
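
The new one-by-n and n-by-one cases above exercise degenerate size-1 dimensions, where broadcasting replicates the single slice. A minimal sketch of that special case:

// A size-1 dimension always reads index 0, so broadcasting it copies the slice.
Eigen::Tensor<float, 2> t(1, 4);
t.setRandom();
Eigen::array<Eigen::Index, 2> b = {{3, 1}};
Eigen::Tensor<float, 2> r = t.broadcast(b);  // result is 3 x 4
// For every i, j: r(i, j) == t(0, j)
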
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
new file mode 100644
index 000000000..72cb62fd5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -0,0 +1,354 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+// Functions used to compare the TensorMap implementation on the device with
+// the equivalent on the host
+namespace cl {
+namespace sycl {
+template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
+template <typename T> T square(T x) { return x * x; }
+template <typename T> T cube(T x) { return x * x * x; }
+template <typename T> T inverse(T x) { return T(1) / x; }
+template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
+template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
+}
+}
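+// For instance, with the shim above the same spelling works on both paths:
+// cl::sycl::square(3.0f) yields 9.0f for the host reference values, while
+// gpu.square() is the device expression the tests compare it against.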
+
+struct EqualAssignement {
+ template <typename Lhs, typename Rhs>
+ void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
+};
+
+struct PlusEqualAssignement {
+ template <typename Lhs, typename Rhs>
+ void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; }
+};
+
+template <typename DataType, int DataLayout,
+ typename Assignement, typename Operator>
+void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ Operator op;
+ Assignement asgn;
+ {
+ /* Assignement(out, Operator(in)) */
+ Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
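+ // Shift the random values away from zero so ops such as log, sqrt, rsqrt
+ // and inverse stay well-defined on every element.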
+ in = in.random() + DataType(0.01);
+ out = out.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data = static_cast<DataType *>(
+ sycl_device.allocate(in.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+ (in.size()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+ (out.size()) * sizeof(DataType));
+ auto device_expr = gpu_out.device(sycl_device);
+ asgn(device_expr, op(gpu));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ DataType ver = reference(i);
+ asgn(ver, op(in(i)));
+ VERIFY_IS_APPROX(out(i), ver);
+ }
+ sycl_device.deallocate(gpu_data);
+ sycl_device.deallocate(gpu_data_out);
+ }
+ {
+ /* Assignement(out, Operator(out)) */
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ out = out.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(),
+ (out.size()) * sizeof(DataType));
+ auto device_expr = gpu_out.device(sycl_device);
+ asgn(device_expr, op(gpu_out));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ DataType ver = reference(i);
+ asgn(ver, op(reference(i)));
+ VERIFY_IS_APPROX(out(i), ver);
+ }
+ sycl_device.deallocate(gpu_data_out);
+ }
+}
+
+#define DECLARE_UNARY_STRUCT(FUNC) \
+ struct op_##FUNC { \
+ template <typename T> \
+ auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
+ return cl::sycl::FUNC(x); \
+ } \
+ template <typename T> \
+ auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
+ return x.FUNC(); \
+ } \
+ };
+
+DECLARE_UNARY_STRUCT(abs)
+DECLARE_UNARY_STRUCT(sqrt)
+DECLARE_UNARY_STRUCT(rsqrt)
+DECLARE_UNARY_STRUCT(square)
+DECLARE_UNARY_STRUCT(cube)
+DECLARE_UNARY_STRUCT(inverse)
+DECLARE_UNARY_STRUCT(tanh)
+DECLARE_UNARY_STRUCT(exp)
+DECLARE_UNARY_STRUCT(expm1)
+DECLARE_UNARY_STRUCT(log)
+DECLARE_UNARY_STRUCT(ceil)
+DECLARE_UNARY_STRUCT(floor)
+DECLARE_UNARY_STRUCT(round)
+DECLARE_UNARY_STRUCT(log1p)
+DECLARE_UNARY_STRUCT(sign)
+DECLARE_UNARY_STRUCT(isnan)
+DECLARE_UNARY_STRUCT(isfinite)
+DECLARE_UNARY_STRUCT(isinf)
+
+template <typename DataType, int DataLayout, typename Assignement>
+void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+#define RUN_UNARY_TEST(FUNC) \
+ test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \
+ op_##FUNC>(sycl_device, tensor_range)
+ RUN_UNARY_TEST(abs);
+ RUN_UNARY_TEST(sqrt);
+ RUN_UNARY_TEST(rsqrt);
+ RUN_UNARY_TEST(square);
+ RUN_UNARY_TEST(cube);
+ RUN_UNARY_TEST(inverse);
+ RUN_UNARY_TEST(tanh);
+ RUN_UNARY_TEST(exp);
+ RUN_UNARY_TEST(expm1);
+ RUN_UNARY_TEST(log);
+ RUN_UNARY_TEST(ceil);
+ RUN_UNARY_TEST(floor);
+ RUN_UNARY_TEST(round);
+ RUN_UNARY_TEST(log1p);
+ RUN_UNARY_TEST(sign);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in) */
+ Operator op;
+ Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range);
+ Tensor<bool, 3, DataLayout, int64_t> out(tensor_range);
+ in = in.random() + DataType(0.01);
+ DataType *gpu_data = static_cast<DataType *>(
+ sycl_device.allocate(in.size() * sizeof(DataType)));
+ bool *gpu_data_out =
+ static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range);
+ TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data, in.data(),
+ (in.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(bool));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_EQUAL(out(i), op(in(i)));
+ }
+ sycl_device.deallocate(gpu_data);
+ sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout>
+void test_unary_builtins(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ test_unary_builtins_for_assignement<DataType, DataLayout,
+ PlusEqualAssignement>(sycl_device, tensor_range);
+ test_unary_builtins_for_assignement<DataType, DataLayout,
+ EqualAssignement>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isnan>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isfinite>(sycl_device, tensor_range);
+ test_unary_builtins_return_bool<DataType, DataLayout,
+ op_isinf>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+
+ test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+ test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in_1, in_2) */
+ Operator op;
+ Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ in_1 = in_1.random() + DataType(0.01);
+ in_2 = in_2.random() + DataType(0.01);
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_1 = static_cast<DataType *>(
+ sycl_device.allocate(in_1.size() * sizeof(DataType)));
+ DataType *gpu_data_2 = static_cast<DataType *>(
+ sycl_device.allocate(in_2.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+ (in_1.size()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(),
+ (in_2.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu_1, gpu_2);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i)));
+ }
+ sycl_device.deallocate(gpu_data_1);
+ sycl_device.deallocate(gpu_data_2);
+ sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, int DataLayout, typename Operator>
+void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ /* out = op(in_1, 2) */
+ Operator op;
+ const DataType arg2(2);
+ Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range);
+ Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range);
+ in_1 = in_1.random();
+ Tensor<DataType, 3, DataLayout, int64_t> reference(out);
+ DataType *gpu_data_1 = static_cast<DataType *>(
+ sycl_device.allocate(in_1.size() * sizeof(DataType)));
+ DataType *gpu_data_out = static_cast<DataType *>(
+ sycl_device.allocate(out.size() * sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range);
+ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(),
+ (in_1.size()) * sizeof(DataType));
+ gpu_out.device(sycl_device) = op(gpu_1, arg2);
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out,
+ (out.size()) * sizeof(DataType));
+ for (int64_t i = 0; i < out.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), op(in_1(i), arg2));
+ }
+ sycl_device.deallocate(gpu_data_1);
+ sycl_device.deallocate(gpu_data_out);
+}
+
+#define DECLARE_BINARY_STRUCT(FUNC) \
+ struct op_##FUNC { \
+ template <typename T1, typename T2> \
+ auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \
+ return cl::sycl::FUNC(x, y); \
+ } \
+ template <typename T1, typename T2> \
+ auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \
+ return x.FUNC(y); \
+ } \
+ };
+
+DECLARE_BINARY_STRUCT(cwiseMax)
+DECLARE_BINARY_STRUCT(cwiseMin)
+
+#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
+ struct op_##NAME { \
+ template <typename T1, typename T2> \
+ auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \
+ return x OPERATOR y; \
+ } \
+ };
+
+DECLARE_BINARY_STRUCT_OP(plus, +)
+DECLARE_BINARY_STRUCT_OP(minus, -)
+DECLARE_BINARY_STRUCT_OP(times, *)
+DECLARE_BINARY_STRUCT_OP(divide, /)
+DECLARE_BINARY_STRUCT_OP(modulo, %)
+
+template <typename DataType, int DataLayout>
+void test_binary_builtins(const Eigen::SyclDevice& sycl_device,
+ const array<int64_t, 3>& tensor_range) {
+ test_binary_builtins_func<DataType, DataLayout,
+ op_cwiseMax>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_cwiseMin>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_plus>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_minus>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_times>(sycl_device, tensor_range);
+ test_binary_builtins_func<DataType, DataLayout,
+ op_divide>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+ test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range);
+ test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range);
+}
+
+template <typename DataType>
+static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
+ int64_t sizeDim1 = 10;
+ int64_t sizeDim2 = 10;
+ int64_t sizeDim3 = 10;
+ array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}};
+ test_binary_builtins_fixed_arg2<DataType, RowMajor,
+ op_modulo>(sycl_device, tensor_range);
+ test_binary_builtins_fixed_arg2<DataType, ColMajor,
+ op_modulo>(sycl_device, tensor_range);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ QueueInterface queueInterface(device);
+ Eigen::SyclDevice sycl_device(&queueInterface);
+ CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+ CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+ CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+ }
+}
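
The op_ functors generated by DECLARE_UNARY_STRUCT above are the glue between the two paths: the scalar overload drives the host reference computation, while the TensorMap overload builds the lazy device expression. A sketch of how the overloads resolve (illustrative only; gpu and gpu_out stand for the TensorMaps built in the tests):

op_sqrt op;
float host_result = op(4.0f);  // scalar overload: calls cl::sycl::sqrt(4.0f)
// TensorMap overload, evaluated lazily on the device:
//   gpu_out.device(sycl_device) = op(gpu);  // resolves to gpu.sqrt()
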
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
index 88c233994..97923d15f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu
@@ -9,20 +9,17 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
-void test_cuda_conversion() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -75,8 +72,8 @@ void test_fallback_conversion() {
}
-void test_cxx11_tensor_cast_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu)
{
- CALL_SUBTEST(test_cuda_conversion());
+ CALL_SUBTEST(test_gpu_conversion());
CALL_SUBTEST(test_fallback_conversion());
}
diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp
index 3c6d0d2ff..45456f3ef 100644
--- a/unsupported/test/cxx11_tensor_casts.cpp
+++ b/unsupported/test/cxx11_tensor_casts.cpp
@@ -8,6 +8,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
+#include "random_without_cast_overflow.h"
#include <Eigen/CXX11/Tensor>
@@ -104,12 +105,82 @@ static void test_small_to_big_type_cast()
}
}
+template <typename FromType, typename ToType>
+static void test_type_cast() {
+ Tensor<FromType, 2> ftensor(100, 200);
+ // Generate random values for a valid cast.
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 200; ++j) {
+ ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value();
+ }
+ }
+
+ Tensor<ToType, 2> ttensor(100, 200);
+ ttensor = ftensor.template cast<ToType>();
+
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 200; ++j) {
+ const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j));
+ VERIFY_IS_APPROX(ttensor(i, j), ref);
+ }
+ }
+}
+
+template<typename Scalar, typename EnableIf = void>
+struct test_cast_runner {
+ static void run() {
+ test_type_cast<Scalar, bool>();
+ test_type_cast<Scalar, int8_t>();
+ test_type_cast<Scalar, int16_t>();
+ test_type_cast<Scalar, int32_t>();
+ test_type_cast<Scalar, int64_t>();
+ test_type_cast<Scalar, uint8_t>();
+ test_type_cast<Scalar, uint16_t>();
+ test_type_cast<Scalar, uint32_t>();
+ test_type_cast<Scalar, uint64_t>();
+ test_type_cast<Scalar, half>();
+ test_type_cast<Scalar, bfloat16>();
+ test_type_cast<Scalar, float>();
+ test_type_cast<Scalar, double>();
+ test_type_cast<Scalar, std::complex<float>>();
+ test_type_cast<Scalar, std::complex<double>>();
+ }
+};
+
+// Only certain types allow cast from std::complex<>.
+template<typename Scalar>
+struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> {
+ static void run() {
+ test_type_cast<Scalar, half>();
+ test_type_cast<Scalar, bfloat16>();
+ test_type_cast<Scalar, std::complex<float>>();
+ test_type_cast<Scalar, std::complex<double>>();
+ }
+};
+
-void test_cxx11_tensor_casts()
+EIGEN_DECLARE_TEST(cxx11_tensor_casts)
{
- CALL_SUBTEST(test_simple_cast());
- CALL_SUBTEST(test_vectorized_cast());
- CALL_SUBTEST(test_float_to_int_cast());
- CALL_SUBTEST(test_big_to_small_type_cast());
- CALL_SUBTEST(test_small_to_big_type_cast());
+ CALL_SUBTEST(test_simple_cast());
+ CALL_SUBTEST(test_vectorized_cast());
+ CALL_SUBTEST(test_float_to_int_cast());
+ CALL_SUBTEST(test_big_to_small_type_cast());
+ CALL_SUBTEST(test_small_to_big_type_cast());
+
+ CALL_SUBTEST(test_cast_runner<bool>::run());
+ CALL_SUBTEST(test_cast_runner<int8_t>::run());
+ CALL_SUBTEST(test_cast_runner<int16_t>::run());
+ CALL_SUBTEST(test_cast_runner<int32_t>::run());
+ CALL_SUBTEST(test_cast_runner<int64_t>::run());
+ CALL_SUBTEST(test_cast_runner<uint8_t>::run());
+ CALL_SUBTEST(test_cast_runner<uint16_t>::run());
+ CALL_SUBTEST(test_cast_runner<uint32_t>::run());
+ CALL_SUBTEST(test_cast_runner<uint64_t>::run());
+ CALL_SUBTEST(test_cast_runner<half>::run());
+ CALL_SUBTEST(test_cast_runner<bfloat16>::run());
+ CALL_SUBTEST(test_cast_runner<float>::run());
+ CALL_SUBTEST(test_cast_runner<double>::run());
+ CALL_SUBTEST(test_cast_runner<std::complex<float>>::run());
+ CALL_SUBTEST(test_cast_runner<std::complex<double>>::run());
+
}
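
The helper pulled in via random_without_cast_overflow.h bounds the random source values so they are representable in the destination type, which is what makes the element-wise comparison above well-defined. A sketch of the contract (names taken from the include above):

// The generated float is guaranteed to fit in int8_t, so the cast cannot
// overflow and the reference comparison is exact up to rounding.
float f = internal::random_without_cast_overflow<float, int8_t>::value();
int8_t i = internal::cast<float, int8_t>(f);
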
diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp
index 1832dec8b..922274462 100644
--- a/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/unsupported/test/cxx11_tensor_chipping.cpp
@@ -43,7 +43,7 @@ static void test_simple_chip()
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
+ for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -75,7 +75,7 @@ static void test_simple_chip()
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
+ for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
@@ -126,7 +126,7 @@ static void test_dynamic_chip()
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
+ for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -158,7 +158,7 @@ static void test_dynamic_chip()
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 7; ++l) {
+ for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
@@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major()
VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0));
}
-void test_cxx11_tensor_chipping()
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping)
{
CALL_SUBTEST(test_simple_chip<ColMajor>());
CALL_SUBTEST(test_simple_chip<RowMajor>());
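
The loop-bound corrections in this hunk follow from chip's shape rule: chipping dimension d removes it and keeps the remaining extents in order, so the verification loops must iterate with the chipped shape. A host-side sketch:

Eigen::Tensor<float, 5> t(2, 3, 5, 7, 11);
t.setRandom();
Eigen::Tensor<float, 4> c = t.chip<1>(1);  // dims are 2 x 5 x 7 x 11
// For every i, j, k, l: c(i, j, k, l) == t(i, 1, j, k, l)
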
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
new file mode 100644
index 000000000..1e7093104
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -0,0 +1,623 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+ Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+ tensor.setRandom();
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
+ sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+ VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+ VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim2; ++i) {
+ for (IndexType j = 0; j < sizeDim3; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+ const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+ DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+ gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
+ sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+ VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim3; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+ const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+ DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+ gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
+ sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+ const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+ DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+ gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
+ sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+ }
+ }
+ }
+ }
+
+
+ array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+ const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+ DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+ gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
+ sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
+ VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_chip1);
+ sycl_device.deallocate(gpu_data_chip2);
+ sycl_device.deallocate(gpu_data_chip3);
+ sycl_device.deallocate(gpu_data_chip4);
+ sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+ Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+
+ tensor.setRandom();
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
+ sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
+ VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
+ VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim2; ++i) {
+ for (IndexType j = 0; j < sizeDim3; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
+ const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
+ DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+ gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
+ sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
+ VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim3; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
+ const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
+ DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
+
+ gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
+ sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
+ VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim4; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
+ const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
+ DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
+
+ gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
+ sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim5; ++l) {
+ VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
+ }
+ }
+ }
+ }
+
+
+ array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
+ const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
+ DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
+
+ gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
+ sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
+
+ VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
+ VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_chip1);
+ sycl_device.deallocate(gpu_data_chip2);
+ sycl_device.deallocate(gpu_data_chip3);
+ sycl_device.deallocate(gpu_data_chip4);
+ sycl_device.deallocate(gpu_data_chip5);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+ Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+
+ Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
+ tensor.setRandom();
+ tensor1.setRandom();
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+ DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
+
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
+
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
+ gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
+ sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
+
+ for (int i = 0; i < sizeDim2; ++i) {
+ for (int j = 0; j < sizeDim3; ++j) {
+ for (int k = 0; k < sizeDim4; ++k) {
+ for (int l = 0; l < sizeDim5; ++l) {
+ DataType expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
+ VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
+ }
+ }
+ }
+ }
+
+ array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
+ Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
+ Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
+ tensor2.setRandom();
+ const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
+ DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+ DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
+ TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
+ gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
+ sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
+
+ for (int i = 0; i < sizeDim2; ++i) {
+ for (int j = 0; j < sizeDim4; ++j) {
+ for (int k = 0; k < sizeDim5; ++k) {
+ DataType expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
+ VERIFY_IS_EQUAL(chip2(i,j,k), expected);
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_tensor1);
+ sycl_device.deallocate(gpu_data_chip1);
+ sycl_device.deallocate(gpu_data_tensor2);
+ sycl_device.deallocate(gpu_data_chip2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+
+ Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
+ input1.setRandom();
+ input2.setRandom();
+
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
+
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
+ gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (i != 1) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
+ input3.setRandom();
+
+ const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
+ DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
+ gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k <sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (j != 1) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
+ input4.setRandom();
+
+ const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
+ DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
+ gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k <sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (k != 3) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
+ Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
+ input5.setRandom();
+
+ const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
+ DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
+ gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k <sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (l != 4) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
+ }
+ }
+ }
+ }
+ }
+ }
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
+ input6.setRandom();
+
+ const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
+ DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
+ gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k <sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (m != 5) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ gpu_tensor.device(sycl_device)=gpu_input1;
+ Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
+ input7.setRandom();
+
+ DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
+ gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
+
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k <sizeDim3; ++k) {
+ for (int l = 0; l < sizeDim4; ++l) {
+ for (int m = 0; m < sizeDim5; ++m) {
+ if (i != 0) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_input1);
+ sycl_device.deallocate(gpu_data_input2);
+ sycl_device.deallocate(gpu_data_input3);
+ sycl_device.deallocate(gpu_data_input4);
+ sycl_device.deallocate(gpu_data_input5);
+ sycl_device.deallocate(gpu_data_input6);
+ sycl_device.deallocate(gpu_data_input7);
+
+}
+
+template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
+ test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);
+ test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+ }
+}
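
test_chip_as_lvalue_sycl checks the write path: assigning through a chip overwrites only the selected slice and leaves the rest of the tensor intact. The same semantics on the host, as a minimal sketch:

Eigen::Tensor<float, 3> t(2, 3, 5);
Eigen::Tensor<float, 2> s(2, 5);
t.setRandom();
s.setRandom();
t.chip<1>(1) = s;  // now t(i, 1, k) == s(i, k); every other j-slice unchanged
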
diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp
index b1ff8aecb..1a18e07cc 100644
--- a/unsupported/test/cxx11_tensor_comparisons.cpp
+++ b/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -77,7 +77,7 @@ static void test_equality()
}
-void test_cxx11_tensor_comparisons()
+EIGEN_DECLARE_TEST(cxx11_tensor_comparisons)
{
CALL_SUBTEST(test_orderings());
CALL_SUBTEST(test_equality());
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
index 2baf5eaad..99447b21d 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu
@@ -8,12 +8,9 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@@ -31,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -51,11 +48,13 @@ void test_cuda_complex_cwise_ops() {
Add = 0,
Sub,
Mul,
- Div
+ Div,
+ Neg,
+ NbOps
};
Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
- for (int op = Add; op <= Div; op++) {
+ for (int op = Add; op < NbOps; op++) {
std::complex<T> expected;
switch (static_cast<CwiseOp>(op)) {
case Add:
@@ -74,6 +73,12 @@ void test_cuda_complex_cwise_ops() {
gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
expected = a / b;
break;
+ case Neg:
+ gpu_out.device(gpu_device) = -gpu_in1;
+ expected = -a;
+ break;
+ case NbOps:
+ break;
}
assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
@@ -90,7 +95,7 @@ void test_cuda_complex_cwise_ops() {
}
-void test_cxx11_tensor_complex_cwise_ops()
+EIGEN_DECLARE_TEST(cxx11_tensor_complex_cwise_ops_gpu)
{
CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
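
The loop rewrite above replaces the hard-coded "op <= Div" bound with a NbOps sentinel, so newly added operations such as Neg are picked up automatically. The pattern in isolation:

enum CwiseOp { Add = 0, Sub, Mul, Div, Neg, NbOps };

void iterate_ops_sketch() {
  for (int op = Add; op < NbOps; ++op) {
    // switch over static_cast<CwiseOp>(op); keeping an empty NbOps case
    // silences -Wswitch without ever being reached at run time
  }
}
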
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu
index d4e111f5d..f8b8ae704 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_gpu.cu
@@ -8,12 +8,9 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_complex
+
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@@ -37,7 +34,7 @@ void test_cuda_nullary() {
cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -73,7 +70,7 @@ void test_cuda_nullary() {
static void test_cuda_sum_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -107,10 +104,45 @@ static void test_cuda_sum_reductions() {
gpu_device.deallocate(gpu_out_ptr);
}
+static void test_cuda_mean_reductions() {
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ const int num_rows = internal::random<int>(1024, 5*1024);
+ const int num_cols = internal::random<int>(1024, 5*1024);
+
+ Tensor<std::complex<float>, 2> in(num_rows, num_cols);
+ in.setRandom();
+
+ Tensor<std::complex<float>, 0> full_redux;
+ full_redux = in.mean();
+
+ std::size_t in_bytes = in.size() * sizeof(std::complex<float>);
+ std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>);
+ std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes));
+ std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes));
+ gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+ TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+ TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr);
+
+ out_gpu.device(gpu_device) = in_gpu.mean();
+
+ Tensor<std::complex<float>, 0> full_redux_gpu;
+ gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+ gpu_device.synchronize();
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+ gpu_device.deallocate(gpu_in_ptr);
+ gpu_device.deallocate(gpu_out_ptr);
+}
static void test_cuda_product_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -145,9 +177,10 @@ static void test_cuda_product_reductions() {
}
-void test_cxx11_tensor_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_complex_gpu)
{
CALL_SUBTEST(test_cuda_nullary());
CALL_SUBTEST(test_cuda_sum_reductions());
+ CALL_SUBTEST(test_cuda_mean_reductions());
CALL_SUBTEST(test_cuda_product_reductions());
}
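
The new mean-reduction check relies on mean() being sum() divided by the element count; a host-side sketch of the identity compared against the GPU result:

Eigen::Tensor<std::complex<float>, 2> in(64, 64);
in.setRandom();
Eigen::Tensor<std::complex<float>, 0> m = in.mean();
Eigen::Tensor<std::complex<float>, 0> s = in.sum();
// m() is approximately s() / std::complex<float>(static_cast<float>(in.size()))
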
diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
index 03ef12e63..bb9418d33 100644
--- a/unsupported/test/cxx11_tensor_concatenation.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -50,7 +50,13 @@ static void test_static_dimension_failure()
.reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
.concatenate(right, 0);
Tensor<int, 2, DataLayout> alternative = left
- .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
+ // Clang rejects the brace-initialized {{{2, 3}}} form here: the call is
+ // ambiguous between the copy constructor and the variadic DSizes
+ // constructor enabled when EIGEN_EMULATE_CXX11_META_H is not defined.
+ // Either spell it Tensor<int, 2>::Dimensions{{2, 3}} or construct the
+ // dimensions explicitly, as done below.
+ .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0);
}
template<int DataLayout>
@@ -123,7 +129,7 @@ static void test_concatenation_as_lvalue()
}
-void test_cxx11_tensor_concatenation()
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation)
{
CALL_SUBTEST(test_dimension_failures<ColMajor>());
CALL_SUBTEST(test_dimension_failures<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
new file mode 100644
index 000000000..765991b35
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType leftDim1 = 2;
+ IndexType leftDim2 = 3;
+ IndexType leftDim3 = 1;
+ Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
+ IndexType rightDim1 = 2;
+ IndexType rightDim2 = 3;
+ IndexType rightDim3 = 1;
+ Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
+
+  // IndexType concatDim1 = 3;
+  // IndexType concatDim2 = 3;
+  // IndexType concatDim3 = 1;
+  // Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
+ Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
+ left.setRandom();
+ right.setRandom();
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+ sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
+ DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
+
+ //concatenation = left.concatenate(right, 0);
+  gpu_out1.device(sycl_device) = gpu_in1.concatenate(gpu_in2, 0);
+ sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
+ VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
+ VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType i = 0; i < 2; ++i) {
+ VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
+ }
+ for (IndexType i = 2; i < 4; ++i) {
+ VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
+ }
+ }
+
+ sycl_device.deallocate(gpu_out_data1);
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
+ DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
+  gpu_out2.device(sycl_device) = gpu_in1.concatenate(gpu_in2, 1);
+ sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
+
+ //concatenation = left.concatenate(right, 1);
+ VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
+ VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
+ VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
+ }
+ for (IndexType j = 3; j < 6; ++j) {
+ VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
+ }
+ }
+ sycl_device.deallocate(gpu_out_data2);
+ Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
+ DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
+  gpu_out3.device(sycl_device) = gpu_in1.concatenate(gpu_in2, 2);
+ sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
+
+ //concatenation = left.concatenate(right, 2);
+ VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
+ VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
+ VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
+ VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
+ }
+ }
+ sycl_device.deallocate(gpu_out_data3);
+ sycl_device.deallocate(gpu_in1_data);
+ sycl_device.deallocate(gpu_in2_data);
+}
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType leftDim1 = 2;
+ IndexType leftDim2 = 3;
+ Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
+
+ IndexType rightDim1 = 2;
+ IndexType rightDim2 = 3;
+ Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
+
+ IndexType concatDim1 = 4;
+ IndexType concatDim2 = 3;
+ Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
+ Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
+ Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
+
+ left.setRandom();
+ right.setRandom();
+ result.setRandom();
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
+
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
+
+ sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+
+// t1.concatenate(t2, 0) = result;
+  gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) = gpu_out;
+ sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ VERIFY_IS_EQUAL(left(i, j), result(i, j));
+ VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
+ }
+ }
+ sycl_device.deallocate(gpu_in1_data);
+ sycl_device.deallocate(gpu_in2_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+
+template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
+ test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorConcat_perDevice<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp
index ad9c9da39..9d806ee3c 100644
--- a/unsupported/test/cxx11_tensor_const.cpp
+++ b/unsupported/test/cxx11_tensor_const.cpp
@@ -55,7 +55,7 @@ static void test_assign_of_const_tensor()
}
-void test_cxx11_tensor_const()
+EIGEN_DECLARE_TEST(cxx11_tensor_const)
{
CALL_SUBTEST(test_simple_assign());
CALL_SUBTEST(test_assign_of_const_tensor());
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index dd68430ce..575bdc1f9 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,21 +10,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_gpu_contraction(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
@@ -47,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -68,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
@@ -81,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
@@ -111,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -131,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()
@@ -139,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(k, 128, 128);
- test_cuda_contraction<RowMajor>(k, 128, 128);
+ test_gpu_contraction<ColMajor>(k, 128, 128);
+ test_gpu_contraction<RowMajor>(k, 128, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, k, 128);
- test_cuda_contraction<RowMajor>(128, k, 128);
+ test_gpu_contraction<ColMajor>(128, k, 128);
+ test_gpu_contraction<RowMajor>(128, k, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, 128, k);
- test_cuda_contraction<RowMajor>(128, 128, k);
+ test_gpu_contraction<ColMajor>(128, 128, k);
+ test_gpu_contraction<RowMajor>(128, 128, k);
}
}
template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
@@ -188,29 +187,32 @@ void test_cuda_contraction_sizes() {
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
- test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+ test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
}
-void test_cxx11_tensor_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
{
- CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
- CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
- CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+ CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
- CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
- CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+ CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+ CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
- CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
- CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+ CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+ CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
- CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
- CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+ CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+ CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif
}
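
The numbered CALL_SUBTEST_n wrappers used above allow one test file to be split into separately compiled and separately run parts. A rough sketch of the mechanism, simplified from the pattern in Eigen's test/main.h (details may vary across versions):

    // Each numbered wrapper is active only when its part macro is defined for
    // the translation unit, so a build can compile just subtest 8, say.
    #if defined(EIGEN_TEST_PART_8) || defined(EIGEN_TEST_PART_ALL)
    #define CALL_SUBTEST_8(FUNC) CALL_SUBTEST(FUNC)
    #else
    #define CALL_SUBTEST_8(FUNC)
    #endif
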
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp
new file mode 100644
index 000000000..fbcc29358
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp
@@ -0,0 +1,1026 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <algorithm>
+#include <chrono>
+#include <ctime>
+#include <iostream>
+
+#include "main.h"
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void static test_sycl_contraction(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ // with these dimensions, the output has 300 * 140 elements, which is
+ // more than 30 * 1024, which is the number of threads in blocks on
+ // a 15 SM GK110 GPU
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+ Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_result(d_t_result, result_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - t_result_gpu(i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+ error_threshold)) {
+ continue;
+ }
+
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void test_sycl_contraction_m(const Device &sycl_device) {
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128,
+ 128);
+ }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void test_sycl_contraction_k(const Device &sycl_device) {
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k,
+ 128);
+ }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void test_sycl_contraction_n(const Device &sycl_device) {
+ for (IndexType k = 32; k < 256; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128,
+ 128, k);
+ }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void test_sycl_contraction_sizes(const Device &sycl_device) {
+ IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
+ 257, 511, 512, 513, 1023, 1024, 1025};
+
+ IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255,
+ 257, 511, 512, 513, 1023, 1024, 1025};
+
+ IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129,
+ 255, 257, 511, 512, 513, 1023, 1024, 1025};
+
+ for (IndexType i = 0; i < 15; i++) {
+ for (IndexType j = 0; j < 15; j++) {
+ for (IndexType k = 0; k < 17; k++) {
+ test_sycl_contraction<DataLayout, DataType, IndexType>(
+ sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
+ }
+ }
+ }
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
+
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+ Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ // Allocate buffers twice as big to check for invalid read and write
+ auto padded_left_size = 2 * t_left.size();
+ auto padded_right_size = 2 * t_right.size();
+ auto padded_result_size = 2 * t_result.size();
+
+ std::size_t t_left_bytes = padded_left_size * sizeof(DataType);
+ std::size_t t_right_bytes = padded_right_size * sizeof(DataType);
+ std::size_t t_result_bytes = padded_result_size * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+  // The TensorMaps are still the same size as the Tensors
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_result(d_t_result, result_dims);
+
+ // Write nan after the actual buffer to propagate nans everywhere in case of
+ // invalid reads
+ DataType nan = std::numeric_limits<DataType>::quiet_NaN();
+ auto host_left_data = new DataType[padded_left_size];
+ std::copy_n(t_left.data(), t_left.size(), host_left_data);
+ std::fill_n(host_left_data + t_left.size(), t_left.size(), nan);
+ auto host_right_data = new DataType[padded_right_size];
+ std::copy_n(t_right.data(), t_right.size(), host_right_data);
+ std::fill_n(host_right_data + t_right.size(), t_right.size(), nan);
+ auto host_result_data = new DataType[padded_result_size];
+ std::fill_n(host_result_data, padded_result_size, nan);
+
+ sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes);
+ sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - host_result_data[i]))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), host_result_data[i],
+ error_threshold)) {
+ continue;
+ }
+ if (std::isnan(host_result_data[i])) {
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", invalid read detected at IndexType " << i << ": "
+ << t_result(i) << " vs " << host_result_data[i] << std::endl;
+ } else {
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", mismatch detected at IndexType " << i << ": "
+ << t_result(i) << " vs " << host_result_data[i] << std::endl;
+ }
+ VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+ }
+ // Make sure that the rest of the result is still nans
+ for (IndexType i = t_result.size(); i < padded_result_size; i++) {
+ if (std::isnan(host_result_data[i])) {
+ continue;
+ }
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", invalid write detected at IndexType " << i << ": "
+ << host_result_data[i] << std::endl;
+ VERIFY_IS_APPROX(host_result_data[i], t_result(i));
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+
+ delete[] host_left_data;
+ delete[] host_right_data;
+ delete[] host_result_data;
+}
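
The NaN-padding trick above generalizes beyond SYCL: over-allocate, seed the tail with NaN, and any out-of-bounds read poisons the result while any out-of-bounds write disturbs a sentinel. A self-contained host-side sketch (the buffer size and payload below are made up for illustration):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <limits>
    #include <vector>

    int main() {
      const std::size_t n = 8;  // logical size; the buffer is twice as large
      std::vector<float> buf(2 * n, std::numeric_limits<float>::quiet_NaN());
      std::fill_n(buf.begin(), n, 1.0f);  // valid payload in the first half
      // ... run the computation under test on buf[0..n) here ...
      for (std::size_t i = n; i < buf.size(); ++i)
        assert(std::isnan(buf[i]));  // sentinel region must remain untouched
      return 0;
    }
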
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size,
+ IndexType n_size) {
+ // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size <<
+ // ")" << std::endl;
+  // The result here is a single scalar, so the sizes only shape the inputs.
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
+ Tensor<DataType, 0, DataLayout, IndexType> t_result;
+ Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
+ Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>>
+ gpu_t_result(d_t_result);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result() - t_result_gpu()))) > error_threshold &&
+ !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
+ std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size
+ << " : mismatch detected: " << t_result() << " vs "
+ << t_result_gpu() << std::endl;
+ VERIFY_IS_APPROX(t_result_gpu(), t_result());
+ }
+
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
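
For orientation, contracting over both index pairs as test_scalar does collapses the result to rank 0: with equal shapes (which the square sizes used below guarantee), it is the Frobenius inner product sum_{i,j} left(i,j) * right(i,j). A small host-side sketch (the helper name is made up):

    #include <unsupported/Eigen/CXX11/Tensor>

    float frobenius_dot(const Eigen::Tensor<float, 2>& l,
                        const Eigen::Tensor<float, 2>& r) {
      // Elementwise product followed by a full reduction yields the same
      // scalar as the two-pair contraction used in the test.
      Eigen::Tensor<float, 0> s = (l * r).sum();
      return s();
    }
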
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void contraction_batch(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size, IndexType m_batch,
+ IndexType start, IndexType limit) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ typedef Eigen::array<IndexType, 3> TensorDim;
+ typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType;
+ TensorDim left_dims = {{m_batch, k_size, m_size}};
+ TensorDim right_dims = {{m_batch, n_size, k_size}};
+ TensorDim res_dims = {{m_batch, m_size, n_size}};
+ Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}};
+
+ TensorType t_left(left_dims);
+ TensorType t_right(right_dims);
+ TensorType t_result_gpu(res_dims);
+ TensorType t_result(res_dims);
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+ for (int i = start; i < limit; ++i) {
+ auto x = gpu_t_left.template chip<0>(i);
+ auto y = gpu_t_right.template chip<0>(i);
+ auto z = gpu_t_result.template chip<0>(i);
+ z.device(sycl_device) = x.contract(y, contract_pairs);
+ }
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ for (int i = start; i < limit; ++i) {
+ auto x = t_left.template chip<0>(i);
+ auto y = t_right.template chip<0>(i);
+ auto z = t_result.template chip<0>(i);
+ z = x.contract(y, contract_pairs);
+ }
+
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - t_result_gpu(i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+ error_threshold)) {
+ continue;
+ }
+ std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
+ Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+ Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+ Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_result(d_t_result, res_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ for (IndexType j = 0; j < m_size; j++) {
+ for (IndexType i = 0; i < n_size; i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i),
+ error_threshold)) {
+ continue;
+ }
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", mismatch detected at IndexType m: " << j << " n: " << i
+ << " CPU : " << t_result(j, i)
+ << " vs SYCL:" << t_result_gpu(j, i) << std::endl;
+ VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i));
+ }
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+ Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
+ Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+ Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_result(d_t_result, res_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - t_result_gpu(i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+ error_threshold)) {
+ continue;
+ }
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+ VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template <int DataLayout, typename DataType, typename IndexType,
+ typename Device>
+void contraction_both_transposed(const Device &sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size) {
+ typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair
+ DimPair;
+ static const DataType error_threshold = DataType(1e-4);
+ Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}};
+ Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}};
+ Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}};
+ Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
+
+ t_left.setRandom();
+ t_right.setRandom();
+
+ std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
+ std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType *d_t_left =
+ static_cast<DataType *>(sycl_device.allocate(t_left_bytes));
+ DataType *d_t_right =
+ static_cast<DataType *>(sycl_device.allocate(t_right_bytes));
+ DataType *d_t_result =
+ static_cast<DataType *>(sycl_device.allocate(t_result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_left(d_t_left, left_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_right(d_t_right, right_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>>
+ gpu_t_result(d_t_result, res_dims);
+
+ sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes);
+ sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result,
+ t_result_bytes);
+
+ t_result = t_left.contract(t_right, dims);
+
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - t_result_gpu(i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+ error_threshold)) {
+ continue;
+ }
+ std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size
+ << ", mismatch detected at IndexType " << i << ": " << t_result(i)
+ << " vs " << t_result_gpu(i) << std::endl;
+
+ VERIFY_IS_APPROX(t_result_gpu(i), t_result(i));
+ }
+ sycl_device.deallocate(d_t_left);
+ sycl_device.deallocate(d_t_right);
+ sycl_device.deallocate(d_t_result);
+}
+
+template <typename Dev>
+void inline tensorOutOfBound(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Test out of bound for Tensor-Tensor
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+ 1024);
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+ 4096);
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024,
+ 2048);
+ test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+ 1024);
+ test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024,
+ 784);
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024,
+ 10);
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096,
+ 513);
+ test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024,
+ 783);
+ test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048,
+ 784);
+ test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024,
+ 11);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor out of bound tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 128, 128,
+ 128);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128,
+ 128);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_m(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device);
+ test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_n(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device);
+ test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_k(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device);
+ test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorTensor_sizes(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device);
+ test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "tensor tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+template <typename Dev>
+void inline vectorVector(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // VECTOR-VECTOR
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1,
+ 1025);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1,
+ 1025);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1,
+ 1024);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1,
+ 1024);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1,
+ 1023);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1,
+ 1023);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "contracted tensor tests finished computation at "
+ << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline vectorTensor(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Vector-Tensor
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025,
+ 1025);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025,
+ 1025);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024,
+ 1024);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024,
+ 1024);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023,
+ 1023);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023,
+ 1023);
+
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097,
+ 4097);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097,
+ 4097);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096,
+ 4096);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096,
+ 4096);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095,
+ 4095);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095,
+ 4095);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816,
+ 32);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorVector(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Matrix-Vector
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025,
+ 1);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024,
+ 1);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023,
+ 1);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097,
+ 1);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196,
+ 1);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+ 1);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095,
+ 1);
+// If GEMV is disabled, a single kernel is created to compute the whole
+// contraction, so the accumulated float rounding error overflows the
+// precision threshold for float and causes the test to fail. With GEMV
+// enabled, multiple kernels are created and the accumulation (and its
+// rounding error) is split among them.
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032,
+ 1);
+#endif
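
As a rough back-of-the-envelope check on the comment above (the error model is a standard rule of thumb, not taken from the source):

    #include <cstdio>

    int main() {
      // Single-precision floats carry ~24 mantissa bits, so each add can lose
      // a relative ~6e-8. A naive sequential sum over n terms grows roughly
      // linearly in n in the worst case.
      const double eps_per_add = 6.0e-8;
      const double n = 802032.0;  // contraction length from the guarded call
      std::printf("worst-case relative error ~ %g\n", n * eps_per_add);
      return 0;  // ~0.05, which dwarfs the 1e-4 threshold these tests use
    }
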
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensorScalar(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // SCALAR Contraction
+ test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+ test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127);
+ test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+ test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128);
+ test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+ test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129);
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_row(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073,
+ 257);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072,
+ 256);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073,
+ 16);
+ test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072,
+ 17);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline skinnyTensor_col(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+ // Tensor Tensor Contraction
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 4, 16);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073,
+ 257);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072,
+ 256);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073,
+ 16);
+ test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072,
+ 17);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_batch_per_device(const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+
+ contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+ 0, 4);
+ contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4,
+ 0, 4);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_lhs_transposed_per_device(
+ const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4,
+ 8);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+ 32);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+ 64);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784,
+ 2048, 1024);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+ 10, 1024);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+ 1024, 1024);
+ contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+ 4096, 1024);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_rhs_transposed_per_device(
+ const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4,
+ 16);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+ 17);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+ 32);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16,
+ 64);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10,
+ 1024, 1024);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024,
+ 1024, 4096);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096,
+ 1024, 2048);
+ contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048,
+ 1024, 784);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template <typename Dev>
+void inline tensor_contraction_both_transposed_per_device(
+ const Dev &sycl_device) {
+ typedef float DataType;
+ typedef int64_t IndexType;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ start = std::chrono::system_clock::now();
+
+ contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5,
+ 17);
+ contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8,
+ 32);
+ contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64,
+ 16, 64);
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ std::time_t end_time = std::chrono::system_clock::to_time_t(end);
+ std::cout << "finished computation at " << std::ctime(&end_time)
+ << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) {
+ for (const auto &device : Eigen::get_sycl_supported_devices()) {
+ std::cout << "Running on "
+ << device.template get_info<cl::sycl::info::device::name>()
+ << std::endl;
+ QueueInterface queueInterface(device);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ CALL_SUBTEST_1(tensorOutofBound(sycl_device));
+ CALL_SUBTEST_2(tensorTensor(sycl_device));
+ CALL_SUBTEST_2(tensorTensor_m(sycl_device));
+ CALL_SUBTEST_2(tensorTensor_n(sycl_device));
+ CALL_SUBTEST_2(tensorTensor_k(sycl_device));
+ CALL_SUBTEST_2(tensorTensor_sizes(sycl_device));
+ CALL_SUBTEST_3(vectorVector(sycl_device));
+ CALL_SUBTEST_4(vectorTensor(sycl_device));
+ CALL_SUBTEST_5(tensorVector(sycl_device));
+ CALL_SUBTEST_6(tensorScalar(sycl_device));
+ CALL_SUBTEST_7(skinnyTensor_row(sycl_device));
+ CALL_SUBTEST_7(skinnyTensor_col(sycl_device));
+ CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device));
+ CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device));
+ CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device));
+ CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index ace97057f..3b5c6a13c 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -471,7 +471,8 @@ static void test_tensor_product()
mat1.setRandom();
mat2.setRandom();
- Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}});
+ Eigen::array<DimPair, 0> dims;
+ Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims);
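+  // (Presumably the separately declared, default-constructed zero-size dims
+  // array sidesteps brace-initialization issues some compilers have with
+  // Eigen::array<T, 0>; the patch does not state the motivation.)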
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_EQUAL(result.dimension(1), 3);
@@ -510,36 +511,91 @@ static void test_const_inputs()
VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
}
-void test_cxx11_tensor_contraction()
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+ const TensorContractionParams&, Index, Index, Index num_rows,
+ Index num_cols) const {
+ for (int i = 0; i < num_rows; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+ }
+ }
+ }
+};
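+
+// Illustrative sketch (not part of this patch): any functor with the same
+// call operator can post-process contraction output blocks. A hypothetical
+// bias-adding kernel, for instance, could look like this ("bias" is an
+// assumed member, not an Eigen API):
+//
+//   struct AddBiasOutputKernel {
+//     float bias;
+//     template <typename Index, typename Scalar>
+//     EIGEN_ALWAYS_INLINE void operator()(
+//         const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+//         const TensorContractionParams&, Index, Index, Index num_rows,
+//         Index num_cols) const {
+//       for (Index i = 0; i < num_rows; ++i)
+//         for (Index j = 0; j < num_cols; ++j) output_mapper(i, j) += bias;
+//     }
+//   };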
+
+template <int DataLayout>
+static void test_large_contraction_with_output_kernel() {
+ Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+ Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+ Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 1500, 248);
+ MapXf m_right(t_right.data(), 248, 1400);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+ // compute results by separate methods
+ t_result = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+ m_result = m_left * m_right;
+
+ for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_contraction)
{
- CALL_SUBTEST(test_evals<ColMajor>());
- CALL_SUBTEST(test_evals<RowMajor>());
- CALL_SUBTEST(test_scalar<ColMajor>());
- CALL_SUBTEST(test_scalar<RowMajor>());
- CALL_SUBTEST(test_multidims<ColMajor>());
- CALL_SUBTEST(test_multidims<RowMajor>());
- CALL_SUBTEST(test_holes<ColMajor>());
- CALL_SUBTEST(test_holes<RowMajor>());
- CALL_SUBTEST(test_full_redux<ColMajor>());
- CALL_SUBTEST(test_full_redux<RowMajor>());
- CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
- CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
- CALL_SUBTEST(test_expr<ColMajor>());
- CALL_SUBTEST(test_expr<RowMajor>());
- CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
- CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
- CALL_SUBTEST(test_consistency<ColMajor>());
- CALL_SUBTEST(test_consistency<RowMajor>());
- CALL_SUBTEST(test_large_contraction<ColMajor>());
- CALL_SUBTEST(test_large_contraction<RowMajor>());
- CALL_SUBTEST(test_matrix_vector<ColMajor>());
- CALL_SUBTEST(test_matrix_vector<RowMajor>());
- CALL_SUBTEST(test_tensor_vector<ColMajor>());
- CALL_SUBTEST(test_tensor_vector<RowMajor>());
- CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
- CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
- CALL_SUBTEST(test_tensor_product<ColMajor>());
- CALL_SUBTEST(test_tensor_product<RowMajor>());
- CALL_SUBTEST(test_const_inputs<ColMajor>());
- CALL_SUBTEST(test_const_inputs<RowMajor>());
+ CALL_SUBTEST_1(test_evals<ColMajor>());
+ CALL_SUBTEST_1(test_evals<RowMajor>());
+ CALL_SUBTEST_1(test_scalar<ColMajor>());
+ CALL_SUBTEST_1(test_scalar<RowMajor>());
+ CALL_SUBTEST_2(test_multidims<ColMajor>());
+ CALL_SUBTEST_2(test_multidims<RowMajor>());
+ CALL_SUBTEST_2(test_holes<ColMajor>());
+ CALL_SUBTEST_2(test_holes<RowMajor>());
+ CALL_SUBTEST_3(test_full_redux<ColMajor>());
+ CALL_SUBTEST_3(test_full_redux<RowMajor>());
+ CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>());
+ CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>());
+ CALL_SUBTEST_4(test_expr<ColMajor>());
+ CALL_SUBTEST_4(test_expr<RowMajor>());
+ CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>());
+ CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>());
+ CALL_SUBTEST_5(test_consistency<ColMajor>());
+ CALL_SUBTEST_5(test_consistency<RowMajor>());
+ CALL_SUBTEST_5(test_large_contraction<ColMajor>());
+ CALL_SUBTEST_5(test_large_contraction<RowMajor>());
+ CALL_SUBTEST_6(test_matrix_vector<ColMajor>());
+ CALL_SUBTEST_6(test_matrix_vector<RowMajor>());
+ CALL_SUBTEST_6(test_tensor_vector<ColMajor>());
+ CALL_SUBTEST_6(test_tensor_vector<RowMajor>());
+ CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>());
+ CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>());
+ CALL_SUBTEST_7(test_tensor_product<ColMajor>());
+ CALL_SUBTEST_7(test_tensor_product<RowMajor>());
+ CALL_SUBTEST_8(test_const_inputs<ColMajor>());
+ CALL_SUBTEST_8(test_const_inputs<RowMajor>());
+ CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>());
+
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
}
diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp
index e3d4675eb..c3688f678 100644
--- a/unsupported/test/cxx11_tensor_convolution.cpp
+++ b/unsupported/test/cxx11_tensor_convolution.cpp
@@ -25,7 +25,8 @@ static void test_evals()
Tensor<float, 2, DataLayout> result(2,3);
result.setZero();
- Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
+ Eigen::array<Tensor<float, 2>::Index, 1> dims3;
+ dims3[0] = 0;
typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
@@ -136,7 +137,7 @@ static void test_strides() {
input(12)*kernel(2)));
}
-void test_cxx11_tensor_convolution()
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution)
{
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
new file mode 100644
index 000000000..3954c8a28
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <iomanip>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+static const float error_threshold = 1e-4f;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_large_expr1D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 = 53;
+  IndexType indim1 = 55;
+  IndexType indim2 = 51;
+  IndexType outdim0 = 50;
+  IndexType outdim1 = 55;
+  IndexType outdim2 = 51;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 1> kernel_dims = {{4}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
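+
+  // Note: with VALID-style convolution the convolved dimension shrinks by
+  // kernel_size - 1, hence outdim0 = 53 - 4 + 1 = 50; the other two
+  // dimensions are untouched by the 1D kernel.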
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 1> dims3{{0}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+  for (IndexType i = 0; i < outdim0; i++) {
+    for (IndexType j = 0; j < outdim1; j++) {
+      for (IndexType k = 0; k < outdim2; k++) {
+        if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+          std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+          assert(false);
+        }
+      }
+    }
+  }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_large_expr2D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 = 53;
+  IndexType indim1 = 55;
+  IndexType indim2 = 51;
+  IndexType outdim0 = 50;
+  IndexType outdim1 = 51;
+  IndexType outdim2 = 51;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 2> dims3{{0,1}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+  for (IndexType i = 0; i < outdim0; i++) {
+    for (IndexType j = 0; j < outdim1; j++) {
+      for (IndexType k = 0; k < outdim2; k++) {
+        if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+          std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+          assert(false);
+        }
+      }
+    }
+  }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_large_expr3D(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType indim0 = 53;
+  IndexType indim1 = 55;
+  IndexType indim2 = 51;
+  IndexType outdim0 = 50;
+  IndexType outdim1 = 51;
+  IndexType outdim2 = 49;
+ Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
+ Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
+ Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
+ Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
+
+ Eigen::array<IndexType, 3> dims3{{0,1,2}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+ result_host.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ result_host=input.convolve(kernel, dims3);
+
+  for (IndexType i = 0; i < outdim0; i++) {
+    for (IndexType j = 0; j < outdim1; j++) {
+      for (IndexType k = 0; k < outdim2; k++) {
+        if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
+          std::cout << std::setprecision(16) << "mismatch detected at index ( " << i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs " << result_host(i,j,k) << std::endl;
+          assert(false);
+        }
+      }
+    }
+  }
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_evals(const Eigen::SyclDevice& sycl_device)
+{
+ Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+ Eigen::array<IndexType, 1> kernel_dims = {{2}};
+ Eigen::array<IndexType, 2> result_dims = {{2, 3}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
+ Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
+
+ Eigen::array<IndexType, 1> dims3{{0}};
+
+ input.setRandom();
+ kernel.setRandom();
+ result.setZero();
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
+ VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
+ VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
+ VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
+ VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
+ VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr(const Eigen::SyclDevice& sycl_device)
+{
+ Eigen::array<IndexType, 2> input_dims = {{3, 3}};
+ Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
+ Eigen::array<IndexType, 2> result_dims = {{2, 2}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
+ Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
+
+ input.setRandom();
+ kernel.setRandom();
+ Eigen::array<IndexType, 2> dims;
+ dims[0] = 0;
+ dims[1] = 1;
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+ input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+ VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+ input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+ VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+ input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+ VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+ input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_result);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_modes(const Eigen::SyclDevice& sycl_device) {
+  Eigen::array<IndexType, 1> input_dims = {{3}};
+  Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+
+  input.setRandom();
+  kernel.setRandom();
+  Eigen::array<IndexType, 1> dims;
+  dims[0] = 0;
+
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
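+
+  // With these values, Eigen's convolve (really a correlation) over the full
+  // input yields 1*0.5 + 2*1.0 + 3*0.0 = 2.5, the single VALID-mode value
+  // checked below; SAME and FULL modes are emulated by padding first.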
+
+ Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
+
+ // Emulate VALID mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(0, 0);
+ Tensor<DataType, 1, DataLayout, IndexType> valid(1);
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t valid_bytes = valid.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
+
+ VERIFY_IS_EQUAL(valid.dimension(0), 1);
+ VERIFY_IS_APPROX(valid(0), 2.5f);
+
+ // Emulate SAME mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(1, 1);
+ Tensor<DataType, 1, DataLayout, IndexType> same(3);
+ std::size_t same_bytes = same.size() * sizeof(DataType);
+ DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
+ gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
+
+ VERIFY_IS_EQUAL(same.dimension(0), 3);
+ VERIFY_IS_APPROX(same(0), 1.0f);
+ VERIFY_IS_APPROX(same(1), 2.5f);
+ VERIFY_IS_APPROX(same(2), 4.0f);
+
+ // Emulate FULL mode (as defined in
+ // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+ padding[0] = std::make_pair(2, 2);
+
+ Tensor<DataType, 1, DataLayout, IndexType> full(5);
+ std::size_t full_bytes = full.size() * sizeof(DataType);
+ DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
+ gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
+ sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
+
+ VERIFY_IS_EQUAL(full.dimension(0), 5);
+ VERIFY_IS_APPROX(full(0), 0.0f);
+ VERIFY_IS_APPROX(full(1), 1.0f);
+ VERIFY_IS_APPROX(full(2), 2.5f);
+ VERIFY_IS_APPROX(full(3), 4.0f);
+ VERIFY_IS_APPROX(full(4), 1.5f);
+
+ sycl_device.deallocate(d_input);
+ sycl_device.deallocate(d_kernel);
+ sycl_device.deallocate(d_valid);
+ sycl_device.deallocate(d_same);
+ sycl_device.deallocate(d_full);
+
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strides(const Eigen::SyclDevice& sycl_device) {
+
+ Eigen::array<IndexType, 1> input_dims = {{13}};
+ Eigen::array<IndexType, 1> kernel_dims = {{3}};
+
+ Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
+ Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
+ Tensor<DataType, 1, DataLayout, IndexType> result(2);
+
+ input.setRandom();
+ kernel.setRandom();
+ Eigen::array<IndexType, 1> dims;
+ dims[0] = 0;
+
+ Eigen::array<IndexType, 1> stride_of_3;
+ stride_of_3[0] = 3;
+ Eigen::array<IndexType, 1> stride_of_2;
+ stride_of_2[0] = 2;
+
+ std::size_t input_bytes = input.size() * sizeof(DataType);
+ std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
+ std::size_t result_bytes = result.size() * sizeof(DataType);
+
+ DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
+ DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
+ DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
+ sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
+ sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
+
+ gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
+ sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
+
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+ input(6)*kernel(2)));
+ VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+ input(12)*kernel(2)));
+}
+
+template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_large_expr1D<float, RowMajor, int64_t>(sycl_device);
+  test_large_expr1D<float, ColMajor, int64_t>(sycl_device);
+  test_large_expr2D<float, RowMajor, int64_t>(sycl_device);
+  test_large_expr2D<float, ColMajor, int64_t>(sycl_device);
+  test_large_expr3D<float, RowMajor, int64_t>(sycl_device);
+  test_large_expr3D<float, ColMajor, int64_t>(sycl_device);
+ test_evals<float, ColMajor, int64_t>(sycl_device);
+ test_evals<float, RowMajor, int64_t>(sycl_device);
+ test_expr<float, ColMajor, int64_t>(sycl_device);
+ test_expr<float, RowMajor, int64_t>(sycl_device);
+ test_modes<float, ColMajor, int64_t>(sycl_device);
+ test_modes<float, RowMajor, int64_t>(sycl_device);
+ test_strides<float, ColMajor, int64_t>(sycl_device);
+ test_strides<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorConvolutionPerDevice(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp
index 4528cc176..b5dbc97bd 100644
--- a/unsupported/test/cxx11_tensor_custom_index.cpp
+++ b/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -88,7 +88,7 @@ static void test_sizes_as_index()
}
-void test_cxx11_tensor_custom_index() {
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) {
test_map_as_index<ColMajor>();
test_map_as_index<RowMajor>();
test_matrix_as_index<ColMajor>();
diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp
index 8baa477cc..875ea57d2 100644
--- a/unsupported/test/cxx11_tensor_custom_op.cpp
+++ b/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -104,7 +104,7 @@ static void test_custom_binary_op()
}
-void test_cxx11_tensor_custom_op()
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op)
{
CALL_SUBTEST(test_custom_unary_op());
CALL_SUBTEST(test_custom_binary_op());
diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
new file mode 100644
index 000000000..d947ead83
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp
@@ -0,0 +1,170 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+template<typename TensorType>
+struct InsertZeros {
+ DSizes<DenseIndex, 2> dimensions(const TensorType& input) const {
+ DSizes<DenseIndex, 2> result;
+ result[0] = input.dimension(0) * 2;
+ result[1] = input.dimension(1) * 2;
+ return result;
+ }
+
+ template <typename Output, typename Device>
+ void eval(const TensorType& input, Output& output, const Device& device) const
+ {
+ array<DenseIndex, 2> strides;
+ strides[0] = 2;
+ strides[1] = 2;
+ output.stride(strides).device(device) = input;
+
+ Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+ Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+ output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+ }
+};
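+
+// For a 2x2 input [[a,b],[c,d]], InsertZeros yields a 4x4 output with the
+// inputs at even (row, col) positions and zeros at odd (row, col) positions;
+// mixed-parity entries are never written, so the test below only checks the
+// even/even and odd/odd elements.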
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+ IndexType sizeDim1 = 3;
+ IndexType sizeDim2 = 5;
+ Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+ Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}};
+
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange);
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange);
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+ typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType;
+ TensorType gpu_in1(gpu_in1_data, tensorRange);
+ TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+ in1.setRandom();
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>());
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(out.dimension(0), 6);
+ VERIFY_IS_EQUAL(out.dimension(1), 10);
+
+ for (int i = 0; i < 6; i+=2) {
+ for (int j = 0; j < 10; j+=2) {
+ VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2));
+ }
+ }
+ for (int i = 1; i < 6; i+=2) {
+ for (int j = 1; j < 10; j+=2) {
+ VERIFY_IS_EQUAL(out(i, j), 0);
+ }
+ }
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+template<typename TensorType>
+struct BatchMatMul {
+ DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const {
+ DSizes<DenseIndex, 3> result;
+ result[0] = input1.dimension(0);
+ result[1] = input2.dimension(1);
+ result[2] = input2.dimension(2);
+ return result;
+ }
+
+ template <typename Output, typename Device>
+ void eval(const TensorType& input1, const TensorType& input2,
+ Output& output, const Device& device) const
+ {
+ typedef typename TensorType::DimensionPair DimPair;
+ array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
+ for (int64_t i = 0; i < output.dimension(2); ++i) {
+ output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims);
+ }
+ }
+};
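+
+// BatchMatMul treats the last dimension as a batch index: for each slice i it
+// contracts dimension 1 of input1 with dimension 0 of input2 (DimPair(1, 0)),
+// i.e. it performs an ordinary matrix product per batch.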
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+ Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}};
+ Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}};
+ Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}};
+
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange);
+
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+
+ typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType;
+ TensorType gpu_in1(gpu_in1_data, tensorRange1);
+ TensorType gpu_in2(gpu_in2_data, tensorRange2);
+ TensorType gpu_out(gpu_out_data, tensorResultRange);
+
+ in1.setRandom();
+ in2.setRandom();
+
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
+
+ gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>());
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+
+ for (IndexType i = 0; i < 5; ++i) {
+ typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair;
+ array<DimPair, 1> dims;
+ dims[0] = DimPair(1, 0);
+ Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims);
+ TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i);
+ for (IndexType j = 0; j < 2; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_APPROX(val(j, k), reference(j, k));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_in1_data);
+ sycl_device.deallocate(gpu_in2_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(custom_op_perDevice<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index fde20ddf2..c9f78d2d3 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -9,16 +9,15 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
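+// TensorGpuHipCudaDefines.h maps the gpu* aliases used below (gpuMalloc,
+// gpuMemcpy, gpuSuccess, gpuStreamSynchronize, ...) to the CUDA or HIP
+// runtime, so the same test source builds for both platforms.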
+
using Eigen::Tensor;
using Eigen::RowMajor;
@@ -68,22 +67,22 @@ struct CPUContext {
// Context for evaluation on GPU
struct GPUContext {
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
- assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
float kernel_1d_val[] = {3.14f, 2.7f};
- assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
- assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
- assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
- assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+ assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
- assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+ assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
}
~GPUContext() {
- assert(cudaFree(kernel_1d_) == cudaSuccess);
- assert(cudaFree(kernel_2d_) == cudaSuccess);
- assert(cudaFree(kernel_3d_) == cudaSuccess);
+ assert(gpuFree(kernel_1d_) == gpuSuccess);
+ assert(gpuFree(kernel_2d_) == gpuSuccess);
+ assert(gpuFree(kernel_3d_) == gpuSuccess);
}
const Eigen::GpuDevice& device() const { return gpu_device_; }
@@ -104,7 +103,7 @@ struct GPUContext {
float* kernel_2d_;
float* kernel_3d_;
- Eigen::CudaStreamDevice stream_;
+ Eigen::GpuStreamDevice stream_;
Eigen::GpuDevice gpu_device_;
};
@@ -283,12 +282,12 @@ void test_gpu() {
float* d_in1;
float* d_in2;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@@ -296,7 +295,7 @@ void test_gpu() {
GPUContext context(gpu_in1, gpu_in2, gpu_out);
test_contextual_eval(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -306,7 +305,7 @@ void test_gpu() {
}
test_forced_contextual_eval(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -316,7 +315,7 @@ void test_gpu() {
}
test_compound_assignment(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -326,7 +325,7 @@ void test_gpu() {
}
test_contraction(&context);
- assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
+ assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
@@ -341,8 +340,8 @@ void test_gpu() {
}
test_1d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
@@ -352,8 +351,8 @@ void test_gpu() {
}
test_2d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
@@ -365,9 +364,13 @@ void test_gpu() {
}
}
+#if !defined(EIGEN_USE_HIP)
+// disable this test on the HIP platform
+// 3D tensor convolutions seem to hang on the HIP platform
+
test_3d_convolution(&context);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
@@ -380,10 +383,13 @@ void test_gpu() {
}
}
}
+
+#endif
+
}
-void test_cxx11_tensor_device()
+EIGEN_DECLARE_TEST(cxx11_tensor_device)
{
CALL_SUBTEST_1(test_cpu());
CALL_SUBTEST_2(test_gpu());
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 7f79753c5..5095cb078 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -13,19 +13,65 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <stdint.h>
+#include <iostream>
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_memory(const Eigen::SyclDevice &sycl_device) {
+ std::cout << "Running on : "
+ << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
+ <<std::endl;
+ IndexType sizeDim1 = 100;
+ array<IndexType, 1> tensorRange = {{sizeDim1}};
+ Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+ Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+ memset(in1.data(), 1, in1.size() * sizeof(DataType));
+ DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+ sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+ for (IndexType i=0; i<in.size(); i++) {
+ VERIFY_IS_EQUAL(in(i), in1(i));
+ }
+ sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+ VERIFY(sycl_device.ok());
+ IndexType sizeDim1 = 100;
+ array<IndexType, 1> tensorDims = {{sizeDim1}};
+ DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+ sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
-void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
- std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
- << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+ out.device(sycl_device) = in / in.constant(0);
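+  // The division by zero raises an asynchronous error on the device; it
+  // surfaces when synchronize() drains the queue, after which ok() is false.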
+
+ sycl_device.synchronize();
+ VERIFY(!sycl_device.ok());
+ sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+ std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+ test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// These tests deliberately throw an exception; enable them if you want to see it.
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
}
-void test_cxx11_tensor_device_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice sycl_device(s);
- CALL_SUBTEST(test_device_sycl(sycl_device));
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+ }
}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..ee416e14a 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,10 +60,29 @@ static void test_rank_zero()
VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
}
-void test_cxx11_tensor_dimension()
+static void test_index_type_promotion() {
+ Eigen::DSizes<int, 3> src0(1, 2, 3);
+ Eigen::array<int, 3> src1;
+ src1[0] = 4;
+ src1[1] = 5;
+ src1[2] = 6;
+
+ Eigen::DSizes<long, 3> dst0(src0);
+ Eigen::DSizes<long, 3> dst1(src1);
+
+ VERIFY_IS_EQUAL(dst0[0], 1L);
+ VERIFY_IS_EQUAL(dst0[1], 2L);
+ VERIFY_IS_EQUAL(dst0[2], 3L);
+ VERIFY_IS_EQUAL(dst1[0], 4L);
+ VERIFY_IS_EQUAL(dst1[1], 5L);
+ VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
{
CALL_SUBTEST(test_dynamic_size());
CALL_SUBTEST(test_fixed_size());
CALL_SUBTEST(test_match());
CALL_SUBTEST(test_rank_zero());
+ CALL_SUBTEST(test_index_type_promotion());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
}
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
{
CALL_SUBTEST(test_empty_tensor());
CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..66b06e8ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yield the
+// same results for all the ops that support tiled evaluation.
+
+// Default assignment that does not use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+ using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+ using Executor =
+ Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+ /*Vectorizable=*/false,
+ /*Tiling=*/TiledEvaluation::Off>;
+
+ Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+ typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+ using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+ using Executor = Eigen::internal::TensorExecutor<const Assign, Device,
+ Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+}
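+
+// Illustrative usage (not exercised directly below): assigning `src.square()`
+// to `dst` with vectorization and tiling enabled on device `d` would be
+//   DeviceAssign</*Vectorizable=*/true, TiledEvaluation::On>(d, dst, src.square());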
+
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+ array<Index, NumDims> dims;
+ for (int i = 0; i < NumDims; ++i) {
+ dims[i] = internal::random<int>(min_dim, max_dim);
+ }
+ return dims;
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_unary_expr(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> src(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
+
+ src.setRandom();
+ const auto expr = src.square();
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T square = src.coeff(i) * src.coeff(i);
+ VERIFY_IS_EQUAL(square, dst.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> lhs(dims);
+ Tensor<T, NumDims, Options, Index> rhs(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
+
+ lhs.setRandom();
+ rhs.setRandom();
+
+ const auto expr = lhs + rhs;
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T sum = lhs.coeff(i) + rhs.coeff(i);
+ VERIFY_IS_EQUAL(sum, dst.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ const auto broadcasts = RandomDims<NumDims>(1, 7);
+ const auto expr = src.broadcast(broadcasts);
+
+ // We assume that broadcasting on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden;
+ golden = expr;
+
+ // Now do the broadcasting using configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_rvalue(Device d)
+{
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Layout, Index> src(dims);
+ src.setRandom();
+
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ const auto expr = src.template chip<(CHIP_DIM)>(offset); \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> golden; \
+ golden = expr; \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
+ \
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tiling>; \
+ \
+ Executor::run(Assign(dst, expr), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_lvalue(Device d)
+{
+ auto dims = RandomDims<NumDims>(1, 10);
+
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ /* Generate random data that we'll assign to the chipped tensor dim. */ \
+ array<Index, NumDims - 1> src_dims; \
+ for (int i = 0; i < NumDims - 1; ++i) { \
+ int dim = i < (CHIP_DIM) ? i : i + 1; \
+ src_dims[i] = dims[dim]; \
+ } \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
+ src.setRandom(); \
+ \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ \
+ Tensor<T, NumDims, Layout, Index> random(dims); \
+ random.setZero(); \
+ \
+ Tensor<T, NumDims, Layout, Index> golden(dims); \
+ golden = random; \
+ golden.template chip<(CHIP_DIM)>(offset) = src; \
+ \
+ Tensor<T, NumDims, Layout, Index> dst(dims); \
+ dst = random; \
+ auto expr = dst.template chip<(CHIP_DIM)>(offset); \
+ \
+ using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tiling>; \
+ \
+ Executor::run(Assign(expr, src), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_rvalue(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ DSizes<Index, NumDims> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
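+  // (shuffle starts as the identity permutation, i.e. already sorted, so the
+  // std::next_permutation loop below visits every permutation exactly once.)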
+
+ // Test all possible shuffle permutations.
+ do {
+ DSizes<Index, NumDims> shuffled_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ shuffled_dims[i] = dims[shuffle[i]];
+ }
+
+ const auto expr = src.shuffle(shuffle);
+
+ // We assume that shuffling on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+ DefaultAssign(golden, expr);
+
+ // Now do the shuffling using configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+ DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+
+ } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ DSizes<Index, NumDims> shuffle;
+ for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+ // Test all possible shuffle permutations.
+ do {
+ DSizes<Index, NumDims> shuffled_dims;
+ for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
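+ // The shuffle is inverted relative to the rvalue test: assigning through
+ // dst.shuffle(shuffle) routes source dimension i to dst dimension shuffle[i].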
+
+ // We assume that shuffling on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+ auto golden_shuffle = golden.shuffle(shuffle);
+ DefaultAssign(golden_shuffle, src);
+
+ // Now do the shuffling using configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+ auto dst_shuffle = dst.shuffle(shuffle);
+ DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+
+ } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+ static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+
+ static constexpr int ReshapedDims = NumDims - 1;
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Fold the 0th and 1st dimensions into one and then shuffle.
+ std::vector<Index> shuffle;
+ for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+ std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
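+ // A default-constructed std::mt19937 uses a fixed seed, so the target
+ // positions are deterministic across runs.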
+
+ DSizes<Index, ReshapedDims> reshaped_dims;
+ reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+ for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+ Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+ // Now reshape using configured tensor executor.
+ Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+ auto expr = src.reshape(reshaped_dims);
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+ static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Pick a random slice of src tensor.
+ auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+ auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ Tensor<T, NumDims, Options, Index> golden =
+ src.slice(slice_start, slice_size);
+
+ // Now slice using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ auto expr = src.slice(slice_start, slice_size);
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+ static_assert(NumDims >= 2, "NumDims must be greater than or equal to 2");
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(5, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Pick a random slice of src tensor.
+ auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+ auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+ // Make sure that slice start + size do not overflow tensor dims.
+ for (int i = 0; i < NumDims; ++i) {
+ slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+ slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+ }
+
+ Tensor<T, NumDims, Options, Index> slice(slice_size);
+ slice.setRandom();
+
+ // Assign a slice using default executor.
+ Tensor<T, NumDims, Options, Index> golden = src;
+ golden.slice(slice_start, slice_size) = slice;
+
+ // And using configured execution strategy.
+ Tensor<T, NumDims, Options, Index> dst = src;
+ auto expr = dst.slice(slice_start, slice_size);
+
+ using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(expr, slice), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ const auto broadcasts = RandomDims<NumDims>(1, 7);
+ const auto expr = src.square().eval().broadcast(broadcasts);
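+ // The .eval() forces src.square() to be materialized into a temporary before
+ // broadcasting, so the executor must handle a forced-eval sub-expression.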
+
+ // We assume that broadcasting on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden;
+ golden = expr;
+
+ // Now do the broadcasting using configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template<typename T, int NumDims>
+struct DummyGenerator {
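+ // Maps a coordinate tuple (the argument holds coordinates despite its name)
+ // to a deterministic value: sum over i of (i + 1) * coords[i].
+ // Illustration: coordinates (2, 3) produce 1*2 + 2*3 = 8.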
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ T operator()(const array <Index, NumDims>& dims) const {
+ T result = static_cast<T>(0);
+ for (int i = 0; i < NumDims; ++i) {
+ result += static_cast<T>((i + 1) * dims[i]);
+ }
+ return result;
+ }
+};
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_generator_op(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(20, 30);
+ Tensor<T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ const auto expr = src.generate(DummyGenerator<T, NumDims>());
+
+ // We assume that the generator op on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor<T, NumDims, Options, Index> golden;
+ golden = expr;
+
+ // Now run the generator op using the configured tensor executor.
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
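+ // The upper bound caps each dimension so that the total number of
+ // coefficients stays around 10^6 regardless of rank.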
+ Tensor <T, NumDims, Options, Index> src(dims);
+ src.setRandom();
+
+ // Reverse a random subset of the dimensions (each with probability 1/2).
+ Eigen::array<bool, NumDims> reverse;
+ for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>();
+
+ const auto expr = src.reverse(reverse);
+
+ // We assume that reversing on a default device is tested and correct, so
+ // we can rely on it to verify correctness of tensor executor and tiling.
+ Tensor <T, NumDims, Options, Index> golden;
+ golden = expr;
+
+ // Now do the reversing using configured tensor executor.
+ Tensor <T, NumDims, Options, Index> dst(golden.dimensions());
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using Executor =
+ internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+ Executor::run(Assign(dst, expr), d);
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_async_execute_unary_expr(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> src(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
+
+ src.setRandom();
+ const auto expr = src.square();
+
+ Eigen::Barrier done(1);
+ auto on_done = [&done]() { done.Notify(); };
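+ // The barrier is initialized with a count of 1, so done.Wait() returns only
+ // after the executor has invoked on_done, which notifies the barrier.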
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using DoneCallback = decltype(on_done);
+ using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+ Vectorizable, Tiling>;
+
+ Executor::runAsync(Assign(dst, expr), d, on_done);
+ done.Wait();
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T square = src.coeff(i) * src.coeff(i);
+ VERIFY_IS_EQUAL(square, dst.coeff(i));
+ }
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_async_execute_binary_expr(Device d)
+{
+ static constexpr int Options = 0 | Layout;
+
+ // Pick a large enough tensor size to bypass small tensor block evaluation
+ // optimization.
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+ Tensor<T, NumDims, Options, Index> lhs(dims);
+ Tensor<T, NumDims, Options, Index> rhs(dims);
+ Tensor<T, NumDims, Options, Index> dst(dims);
+
+ lhs.setRandom();
+ rhs.setRandom();
+
+ const auto expr = lhs + rhs;
+
+ Eigen::Barrier done(1);
+ auto on_done = [&done]() { done.Notify(); };
+
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+ using DoneCallback = decltype(on_done);
+ using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback,
+ Vectorizable, Tiling>;
+
+ Executor::runAsync(Assign(dst, expr), d, on_done);
+ done.Wait();
+
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+ T sum = lhs.coeff(i) + rhs.coeff(i);
+ VERIFY_IS_EQUAL(sum, dst.coeff(i));
+ }
+}
+
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL
+#else
+#define VECTORIZABLE(VAL) VAL
+#endif
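+// With EIGEN_DONT_VECTORIZE defined, VECTORIZABLE(true) is intended to expand
+// to false, so the nominally vectorized subtest variants below fall back to
+// scalar execution.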
+
+#define CALL_SUBTEST_PART(PART) \
+ CALL_SUBTEST_##PART
+
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
+
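+// Each CALL_SUBTEST_COMBINATIONS invocation instantiates the named test for
+// every point in the {device} x {vectorizable} x {tiling} x {layout} grid,
+// i.e. 16 variants per test function.
+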
+// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+ Eigen::DefaultDevice default_device;
+ // Default device is unused in ASYNC tests.
+ EIGEN_UNUSED_VARIABLE(default_device);
+
+ const auto num_threads = internal::random<int>(20, 24);
+ Eigen::ThreadPool tp(num_threads);
+ Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
+
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
+
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
+}
diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 77e24cb67..169fc1898 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp
@@ -7,6 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <numeric>
+
#include "main.h"
#include <Eigen/CXX11/Tensor>
@@ -193,26 +195,23 @@ static void test_constants()
static void test_boolean()
{
- Tensor<int, 1> vec(6);
- std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+ const int kSize = 31;
+ Tensor<int, 1> vec(kSize);
+ std::iota(vec.data(), vec.data() + kSize, 0);
// Test ||.
Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
- VERIFY_IS_EQUAL(bool1[0], true);
- VERIFY_IS_EQUAL(bool1[1], false);
- VERIFY_IS_EQUAL(bool1[2], false);
- VERIFY_IS_EQUAL(bool1[3], false);
- VERIFY_IS_EQUAL(bool1[4], false);
- VERIFY_IS_EQUAL(bool1[5], true);
+ for (int i = 0; i < kSize; ++i) {
+ bool expected = i < 1 || i > 4;
+ VERIFY_IS_EQUAL(bool1[i], expected);
+ }
// Test &&, including cast of operand vec.
Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
- VERIFY_IS_EQUAL(bool2[0], false);
- VERIFY_IS_EQUAL(bool2[1], true);
- VERIFY_IS_EQUAL(bool2[2], true);
- VERIFY_IS_EQUAL(bool2[3], true);
- VERIFY_IS_EQUAL(bool2[4], false);
- VERIFY_IS_EQUAL(bool2[5], false);
+ for (int i = 0; i < kSize; ++i) {
+ bool expected = bool(i) && i < 4;
+ VERIFY_IS_EQUAL(bool2[i], expected);
+ }
// Compilation tests:
// Test Tensor<bool> against results of cast or comparison; verifies that
@@ -300,8 +299,152 @@ static void test_select()
}
}
+template <typename Scalar>
+void test_minmax_nan_propagation_templ() {
+ for (int size = 1; size < 17; ++size) {
+ const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN();
+ const Scalar kInf = std::numeric_limits<Scalar>::infinity();
+ const Scalar kZero(0);
+ Tensor<Scalar, 1> vec_all_nan(size);
+ Tensor<Scalar, 1> vec_one_nan(size);
+ Tensor<Scalar, 1> vec_zero(size);
+ vec_all_nan.setConstant(kNaN);
+ vec_zero.setZero();
+ vec_one_nan.setZero();
+ vec_one_nan(size/2) = kNaN;
+
+ auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) {
+ for (int i = 0; i < size; ++i) {
+ VERIFY((numext::isnan)(v(i)));
+ }
+ };
+
+ auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) {
+ for (int i = 0; i < size; ++i) {
+ VERIFY_IS_EQUAL(v(i), Scalar(0));
+ }
+ };
+
+ // Test NaN propagating max.
+ // max(nan, nan) = nan
+ // max(nan, 0) = nan
+ // max(0, nan) = nan
+ // max(0, 0) = 0
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kNaN));
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_all_nan));
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kZero));
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_zero));
+ verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN));
+ verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_all_nan));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero));
+
+ // Test number propagating max.
+ // max(nan, nan) = nan
+ // max(nan, 0) = 0
+ // max(0, nan) = 0
+ // max(0, 0) = 0
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(kNaN));
+ verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_all_nan));
+ verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(kZero));
+ verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_zero));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_all_nan));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero));
+ verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero));
+
+ // Test NaN propagating min.
+ // min(nan, nan) = nan
+ // min(nan, 0) = nan
+ // min(0, nan) = nan
+ // min(0, 0) = 0
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kNaN));
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_all_nan));
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kZero));
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_zero));
+ verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN));
+ verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_all_nan));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero));
+
+ // Test number propagating min.
+ // min(nan, nan) = nan
+ // min(nan, 0) = 0
+ // min(0, nan) = 0
+ // min(0, 0) = 0
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(kNaN));
+ verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_all_nan));
+ verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(kZero));
+ verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_zero));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_all_nan));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero));
+ verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero));
+
+ // Test min and max reduction
+ Tensor<Scalar, 0> val;
+ val = vec_zero.minimum();
+ VERIFY_IS_EQUAL(val(), kZero);
+ val = vec_zero.template minimum<PropagateNaN>();
+ VERIFY_IS_EQUAL(val(), kZero);
+ val = vec_zero.template minimum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), kZero);
+ val = vec_zero.maximum();
+ VERIFY_IS_EQUAL(val(), kZero);
+ val = vec_zero.template maximum<PropagateNaN>();
+ VERIFY_IS_EQUAL(val(), kZero);
+ val = vec_zero.template maximum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), kZero);
+
+ // Test NaN propagation for tensor of all NaNs.
+ val = vec_all_nan.template minimum<PropagateNaN>();
+ VERIFY((numext::isnan)(val()));
+ val = vec_all_nan.template minimum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), kInf);
+ val = vec_all_nan.template maximum<PropagateNaN>();
+ VERIFY((numext::isnan)(val()));
+ val = vec_all_nan.template maximum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), -kInf);
+
+ // Test NaN propagation for tensor with a single NaN.
+ val = vec_one_nan.template minimum<PropagateNaN>();
+ VERIFY((numext::isnan)(val()));
+ val = vec_one_nan.template minimum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero));
+ val = vec_one_nan.template maximum<PropagateNaN>();
+ VERIFY((numext::isnan)(val()));
+ val = vec_one_nan.template maximum<PropagateNumbers>();
+ VERIFY_IS_EQUAL(val(), (size == 1 ? -kInf : kZero));
+ }
+}
+
+static void test_clip()
+{
+ Tensor<float, 1> vec(6);
+ vec(0) = 4.0;
+ vec(1) = 8.0;
+ vec(2) = 15.0;
+ vec(3) = 16.0;
+ vec(4) = 23.0;
+ vec(5) = 42.0;
+
+ float kMin = 20;
+ float kMax = 30;
+
+ Tensor<float, 1> vec_clipped(6);
+ vec_clipped = vec.clip(kMin, kMax);
+ for (int i = 0; i < 6; ++i) {
+ VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+ }
+}
+
+static void test_minmax_nan_propagation()
+{
+ test_minmax_nan_propagation_templ<float>();
+ test_minmax_nan_propagation_templ<double>();
+}
-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
@@ -311,4 +454,11 @@ void test_cxx11_tensor_expr()
CALL_SUBTEST(test_functors());
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
+ CALL_SUBTEST(test_clip());
+
+// NaN propagation currently does not behave the way one would expect from
+// std::max/std::min, so we disable this test on ARM for now.
+#if !EIGEN_ARCH_ARM_OR_ARM64
+ CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
}
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 2f14ebc62..2e1008eca 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,7 +224,35 @@ static void test_fft_real_input_energy() {
}
}
-void test_cxx11_tensor_fft() {
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+ int n = (1 << exponent) + 1;
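+ // n = 2^exponent + 1 is odd, so the transform length is never a power of two
+ // and the FFT must take its non-power-of-two code path.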
+
+ Eigen::DSizes<ptrdiff_t, 1> dimensions;
+ dimensions[0] = n;
+ const DSizes<ptrdiff_t, 1> arr = dimensions;
+ Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+ input.resize(arr);
+ input.setRandom();
+
+ array<int, 1> fft;
+ fft[0] = 0;
+
+ Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+ input.template fft<BothParts, FFT_FORWARD>(fft);
+
+ Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+ forward.template fft<RealPart, FFT_REVERSE>(fft);
+
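+ // The tolerance mixes absolute and relative error: it scales with the
+ // magnitudes involved and adds 1 so the bound does not vanish near zero.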
+ for (int i = 0; i < n; ++i) {
+ RealScalar tol = test_precision<RealScalar>() *
+ (std::abs(input[i]) + std::abs(output[i]) + 1);
+ VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
test_fft_complex_input_golden();
test_fft_real_input_golden();
@@ -270,4 +298,7 @@ void test_cxx11_tensor_fft() {
test_fft_real_input_energy<RowMajor, double, true, Eigen::BothParts, FFT_FORWARD, 4>();
test_fft_real_input_energy<RowMajor, float, false, Eigen::BothParts, FFT_FORWARD, 4>();
test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>();
+
+ test_fft_non_power_of_2_round_trip<float>(7);
+ test_fft_non_power_of_2_round_trip<double>(7);
}
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 4c660de65..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
VERIFY_IS_EQUAL(scalar1.rank(), 0);
VERIFY_IS_EQUAL(scalar1.size(), 1);
- VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+ VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);
scalar1() = 7.0;
scalar2() = 13.0;
@@ -250,7 +250,7 @@ static void test_array()
}
}
-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..a21a02bec 100644
--- a/unsupported/test/cxx11_tensor_forced_eval.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -61,7 +61,7 @@ static void test_const()
Eigen::array<int, 2> bcast;
bcast[0] = 3;
bcast[1] = 1;
- const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
+ const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3);
Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
for (int i = 0; i < 3; ++i) {
@@ -72,7 +72,7 @@ static void test_const()
}
-void test_cxx11_tensor_forced_eval()
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval)
{
CALL_SUBTEST(test_simple());
CALL_SUBTEST(test_const());
diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index 5690da723..a55a5ad8a 100644
--- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -13,44 +13,44 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
-
+template <typename DataType, int DataLayout, typename IndexType>
void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 100;
- int sizeDim2 = 200;
- int sizeDim3 = 200;
- Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Eigen::Tensor<float, 3> in1(tensorRange);
- Eigen::Tensor<float, 3> in2(tensorRange);
- Eigen::Tensor<float, 3> out(tensorRange);
+ IndexType sizeDim1 = 100;
+ IndexType sizeDim2 = 20;
+ IndexType sizeDim3 = 20;
+ Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+ Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
- float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
- float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
- float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
- in1 = in1.random() + in1.constant(10.0f);
- in2 = in2.random() + in2.constant(10.0f);
+ in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f));
+ in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f));
// creating TensorMap from tensor
- Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
- sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float));
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k),
(in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
}
@@ -63,8 +63,15 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
}
-void test_cxx11_tensor_forced_eval_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice sycl_device(s);
- CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
+template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
+ CALL_SUBTEST(tensorForced_evalperDevice<half>(device));
+ }
}
diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp
index dcb928714..6dcf676bb 100644
--- a/unsupported/test/cxx11_tensor_generator.cpp
+++ b/unsupported/test/cxx11_tensor_generator.cpp
@@ -42,11 +42,11 @@ struct Generator2D {
template <int DataLayout>
static void test_2D()
{
- Tensor<float, 2> matrix(5, 7);
+ Tensor<float, 2> matrix(512, 512);
Tensor<float, 2> result = matrix.generate(Generator2D());
- for (int i = 0; i < 5; ++i) {
- for (int j = 0; j < 5; ++j) {
+ for (int i = 0; i < 512; ++i) {
+ for (int j = 0; j < 512; ++j) {
VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
}
}
@@ -80,7 +80,7 @@ static void test_gaussian()
}
-void test_cxx11_tensor_generator()
+EIGEN_DECLARE_TEST(cxx11_tensor_generator)
{
CALL_SUBTEST(test_1D<ColMajor>());
CALL_SUBTEST(test_1D<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp
new file mode 100644
index 000000000..fb6e3d9d0
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold = 1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+ Generator1D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+ return coordinates[0];
+ }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 6;
+ array<IndexType, 1> tensorRange = {{sizeDim1}};
+ Tensor<DataType, 1, DataLayout,IndexType> vec(tensorRange);
+ Tensor<DataType, 1, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize = vec.size() * sizeof(DataType);
+ DataType* gpu_data_vec = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+ TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_vec.generate(Generator1D());
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ VERIFY_IS_EQUAL(result(i), i);
+ }
+}
+
+
+struct Generator2D {
+ Generator2D() { }
+
+ float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+ return 3 * coordinates[0] + 11 * coordinates[1];
+ }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 5;
+ IndexType sizeDim2 = 7;
+ array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+ Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+ Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize = matrix.size() * sizeof(DataType);
+ DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_matrix.generate(Generator2D());
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+ }
+ }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType rows = 32;
+ IndexType cols = 48;
+ array<DataType, 2> means;
+ means[0] = rows / 2.0f;
+ means[1] = cols / 2.0f;
+ array<DataType, 2> std_devs;
+ std_devs[0] = 3.14f;
+ std_devs[1] = 2.7f;
+ internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
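+ // GaussianGenerator evaluates exp(-sum_d (c_d - mean_d)^2 / (2 * std_d^2)) at
+ // each coordinate; the verification loop below recomputes this by hand.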
+
+ array<IndexType, 2> tensorRange = {{rows, cols}};
+ Tensor<DataType, 2, DataLayout,IndexType> matrix(tensorRange);
+ Tensor<DataType, 2, DataLayout,IndexType> result(tensorRange);
+
+ const size_t tensorBuffSize = matrix.size() * sizeof(DataType);
+ DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout,IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+ gpu_result.device(sycl_device)=gpu_matrix.generate(gaussian_gen);
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+ for (IndexType i = 0; i < rows; ++i) {
+ for (IndexType j = 0; j < cols; ++j) {
+ DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+ DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+ DataType gaussian = expf(-g_rows - g_cols);
+ VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+ }
+ }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0ba9d52e9..137d0d596 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,18 +9,19 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11
+
using Eigen::Tensor;
-void test_cuda_nullary() {
+void test_gpu_nullary() {
Tensor<float, 1, 0, int> in1(2);
Tensor<float, 1, 0, int> in2(2);
in1.setRandom();
@@ -30,12 +31,12 @@ void test_cuda_nullary() {
float* d_in1;
float* d_in2;
- cudaMalloc((void**)(&d_in1), tensor_bytes);
- cudaMalloc((void**)(&d_in2), tensor_bytes);
- cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+ gpuMalloc((void**)(&d_in1), tensor_bytes);
+ gpuMalloc((void**)(&d_in2), tensor_bytes);
+ gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -49,23 +50,23 @@ void test_cuda_nullary() {
Tensor<float, 1, 0, int> new1(2);
Tensor<float, 1, 0, int> new2(2);
- assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(new1(i), 3.14f);
VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
}
- cudaFree(d_in1);
- cudaFree(d_in2);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
}
-void test_cuda_elementwise_small() {
+void test_gpu_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -79,14 +80,14 @@ void test_cuda_elementwise_small() {
float* d_in1;
float* d_in2;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -98,9 +99,9 @@ void test_cuda_elementwise_small() {
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(
@@ -108,12 +109,12 @@ void test_cuda_elementwise_small() {
in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
}
- cudaFree(d_in1);
- cudaFree(d_in2);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
+ gpuFree(d_out);
}
-void test_cuda_elementwise()
+void test_gpu_elementwise()
{
Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -132,16 +133,16 @@ void test_cuda_elementwise()
float* d_in2;
float* d_in3;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_in2), in2_bytes);
- cudaMalloc((void**)(&d_in3), in3_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_in2), in2_bytes);
+ gpuMalloc((void**)(&d_in3), in3_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
@@ -151,8 +152,8 @@ void test_cuda_elementwise()
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
@@ -162,13 +163,13 @@ void test_cuda_elementwise()
}
}
- cudaFree(d_in1);
- cudaFree(d_in2);
- cudaFree(d_in3);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_in2);
+ gpuFree(d_in3);
+ gpuFree(d_out);
}
-void test_cuda_props() {
+void test_gpu_props() {
Tensor<float, 1> in1(200);
Tensor<bool, 1> out(200);
in1.setRandom();
@@ -178,12 +179,12 @@ void test_cuda_props() {
float* d_in1;
bool* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
@@ -193,19 +194,19 @@ void test_cuda_props() {
gpu_out.device(gpu_device) = (gpu_in1.isnan)();
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
- gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 200; ++i) {
VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
}
- cudaFree(d_in1);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_out);
}
-void test_cuda_reduction()
+void test_gpu_reduction()
{
Tensor<float, 4> in1(72,53,97,113);
Tensor<float, 2> out(72,97);
@@ -216,12 +217,12 @@ void test_cuda_reduction()
float* d_in1;
float* d_out;
- cudaMalloc((void**)(&d_in1), in1_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
@@ -233,8 +234,8 @@ void test_cuda_reduction()
gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -249,12 +250,12 @@ void test_cuda_reduction()
}
}
- cudaFree(d_in1);
- cudaFree(d_out);
+ gpuFree(d_in1);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_contraction()
+void test_gpu_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
@@ -274,14 +275,14 @@ void test_cuda_contraction()
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
@@ -301,7 +302,7 @@ void test_cuda_contraction()
m_result = m_left * m_right;
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
- cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
@@ -310,13 +311,13 @@ void test_cuda_contraction()
}
}
- cudaFree(d_t_left);
- cudaFree(d_t_right);
- cudaFree(d_t_result);
+ gpuFree(d_t_left);
+ gpuFree(d_t_right);
+ gpuFree(d_t_result);
}
template<int DataLayout>
-void test_cuda_convolution_1d()
+void test_gpu_convolution_1d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
@@ -331,14 +332,14 @@ void test_cuda_convolution_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
@@ -348,8 +349,8 @@ void test_cuda_convolution_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 34; ++j) {
@@ -364,12 +365,12 @@ void test_cuda_convolution_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
-void test_cuda_convolution_inner_dim_col_major_1d()
+void test_gpu_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
@@ -384,14 +385,14 @@ void test_cuda_convolution_inner_dim_col_major_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
@@ -401,8 +402,8 @@ void test_cuda_convolution_inner_dim_col_major_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(0);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 71; ++i) {
for (int j = 0; j < 9; ++j) {
@@ -417,12 +418,12 @@ void test_cuda_convolution_inner_dim_col_major_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
-void test_cuda_convolution_inner_dim_row_major_1d()
+void test_gpu_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
@@ -437,14 +438,14 @@ void test_cuda_convolution_inner_dim_row_major_1d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
@@ -454,8 +455,8 @@ void test_cuda_convolution_inner_dim_row_major_1d()
Eigen::array<Eigen::DenseIndex, 1> dims(3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 9; ++j) {
@@ -470,13 +471,13 @@ void test_cuda_convolution_inner_dim_row_major_1d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_convolution_2d()
+void test_gpu_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
@@ -491,14 +492,14 @@ void test_cuda_convolution_2d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
@@ -508,8 +509,8 @@ void test_cuda_convolution_2d()
Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
@@ -534,13 +535,13 @@ void test_cuda_convolution_2d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
template<int DataLayout>
-void test_cuda_convolution_3d()
+void test_gpu_convolution_3d()
{
Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
Tensor<float, 3, DataLayout> kernel(3,4,2);
@@ -555,14 +556,14 @@ void test_cuda_convolution_3d()
float* d_input;
float* d_kernel;
float* d_out;
- cudaMalloc((void**)(&d_input), input_bytes);
- cudaMalloc((void**)(&d_kernel), kernel_bytes);
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_input), input_bytes);
+ gpuMalloc((void**)(&d_kernel), kernel_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
@@ -572,8 +573,8 @@ void test_cuda_convolution_3d()
Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
@@ -612,14 +613,15 @@ void test_cuda_convolution_3d()
}
}
- cudaFree(d_input);
- cudaFree(d_kernel);
- cudaFree(d_out);
+ gpuFree(d_input);
+ gpuFree(d_kernel);
+ gpuFree(d_out);
}
+#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
-void test_cuda_lgamma(const Scalar stddev)
+void test_gpu_lgamma(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -631,12 +633,12 @@ void test_cuda_lgamma(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -644,8 +646,8 @@ void test_cuda_lgamma(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.lgamma();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -653,12 +655,13 @@ void test_cuda_lgamma(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
+#endif
template <typename Scalar>
-void test_cuda_digamma()
+void test_gpu_digamma()
{
Tensor<Scalar, 1> in(7);
Tensor<Scalar, 1> out(7);
@@ -685,12 +688,12 @@ void test_cuda_digamma()
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
@@ -698,8 +701,8 @@ void test_cuda_digamma()
gpu_out.device(gpu_device) = gpu_in.digamma();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
@@ -708,12 +711,12 @@ void test_cuda_digamma()
VERIFY_IS_EQUAL(out(i), expected_out(i));
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_zeta()
+void test_gpu_zeta()
{
Tensor<Scalar, 1> in_x(6);
Tensor<Scalar, 1> in_q(6);
@@ -747,14 +750,14 @@ void test_cuda_zeta()
Scalar* d_in_x;
Scalar* d_in_q;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_q), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_q), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
@@ -763,8 +766,8 @@ void test_cuda_zeta()
gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out(0), expected_out(0));
VERIFY((std::isnan)(out(3)));
@@ -775,13 +778,13 @@ void test_cuda_zeta()
}
}
- cudaFree(d_in_x);
- cudaFree(d_in_q);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_q);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_polygamma()
+void test_gpu_polygamma()
{
Tensor<Scalar, 1> in_x(7);
Tensor<Scalar, 1> in_n(7);
@@ -818,14 +821,14 @@ void test_cuda_polygamma()
Scalar* d_in_x;
Scalar* d_in_n;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_n), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_n), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
@@ -834,20 +837,20 @@ void test_cuda_polygamma()
gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 7; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
- cudaFree(d_in_x);
- cudaFree(d_in_n);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_n);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_igamma()
+void test_gpu_igamma()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
@@ -883,14 +886,14 @@ void test_cuda_igamma()
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
- assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+ assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
- cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -899,8 +902,8 @@ void test_cuda_igamma()
gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
@@ -912,13 +915,13 @@ void test_cuda_igamma()
}
}
- cudaFree(d_a);
- cudaFree(d_x);
- cudaFree(d_out);
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_igammac()
+void test_gpu_igammac()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
@@ -953,14 +956,14 @@ void test_cuda_igammac()
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
- cudaMalloc((void**)(&d_a), bytes);
- cudaMalloc((void**)(&d_x), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_a), bytes);
+ gpuMalloc((void**)(&d_x), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
@@ -969,8 +972,8 @@ void test_cuda_igammac()
gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
@@ -982,13 +985,14 @@ void test_cuda_igammac()
}
}
- cudaFree(d_a);
- cudaFree(d_x);
- cudaFree(d_out);
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
}
+#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
-void test_cuda_erf(const Scalar stddev)
+void test_gpu_erf(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -1000,12 +1004,12 @@ void test_cuda_erf(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
- assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+ assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
+ assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1013,8 +1017,8 @@ void test_cuda_erf(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.erf();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -1022,12 +1026,12 @@ void test_cuda_erf(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
@@ -1039,12 +1043,12 @@ void test_cuda_erfc(const Scalar stddev)
Scalar* d_in;
Scalar* d_out;
- cudaMalloc((void**)(&d_in), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1052,8 +1056,8 @@ void test_cuda_erfc(const Scalar stddev)
gpu_out.device(gpu_device) = gpu_in.erfc();
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
@@ -1061,12 +1065,73 @@ void test_cuda_erfc(const Scalar stddev)
}
}
- cudaFree(d_in);
- cudaFree(d_out);
+ gpuFree(d_in);
+ gpuFree(d_out);
+}
+#endif
+template <typename Scalar>
+void test_gpu_ndtri()
+{
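+  // ndtri(p) is the inverse of the standard normal CDF: Phi(ndtri(p)) = p,
+  // e.g. ndtri(0.8) ~ 0.8416 and ndtri(0.99) ~ 2.3263, matching expected_out below.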
+  Tensor<Scalar, 1> in_x(9);
+  Tensor<Scalar, 1> out(9);
+  Tensor<Scalar, 1> expected_out(9);
+ out.setZero();
+
+ in_x(0) = Scalar(1);
+ in_x(1) = Scalar(0.);
+ in_x(2) = Scalar(0.5);
+ in_x(3) = Scalar(0.2);
+ in_x(4) = Scalar(0.8);
+ in_x(5) = Scalar(0.9);
+ in_x(6) = Scalar(0.1);
+ in_x(7) = Scalar(0.99);
+ in_x(8) = Scalar(0.01);
+
+ expected_out(0) = std::numeric_limits<Scalar>::infinity();
+ expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+ expected_out(2) = Scalar(0.0);
+ expected_out(3) = Scalar(-0.8416212335729142);
+ expected_out(4) = Scalar(0.8416212335729142);
+ expected_out(5) = Scalar(1.2815515655446004);
+ expected_out(6) = Scalar(-1.2815515655446004);
+ expected_out(7) = Scalar(2.3263478740408408);
+ expected_out(8) = Scalar(-2.3263478740408408);
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in_x;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
+
+ gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY_IS_EQUAL(out(1), expected_out(1));
+
+  for (int i = 2; i < 9; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+ gpuFree(d_in_x);
+ gpuFree(d_out);
}
template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
{
Tensor<Scalar, 1> in_x(125);
Tensor<Scalar, 1> in_a(125);
@@ -1175,16 +1240,16 @@ void test_cuda_betainc()
Scalar* d_in_a;
Scalar* d_in_b;
Scalar* d_out;
- cudaMalloc((void**)(&d_in_x), bytes);
- cudaMalloc((void**)(&d_in_a), bytes);
- cudaMalloc((void**)(&d_in_b), bytes);
- cudaMalloc((void**)(&d_out), bytes);
+ gpuMalloc((void**)(&d_in_x), bytes);
+ gpuMalloc((void**)(&d_in_a), bytes);
+ gpuMalloc((void**)(&d_in_b), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
- cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
@@ -1194,8 +1259,8 @@ void test_cuda_betainc()
gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
- assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 1; i < 125; ++i) {
if ((std::isnan)(expected_out(i))) {
@@ -1205,83 +1270,374 @@ void test_cuda_betainc()
}
}
- cudaFree(d_in_x);
- cudaFree(d_in_a);
- cudaFree(d_in_b);
- cudaFree(d_out);
+ gpuFree(d_in_x);
+ gpuFree(d_in_a);
+ gpuFree(d_in_b);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i0e()
+{
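+  // bessel_i0e(x) = exp(-|x|) * I0(x): the exponentially scaled modified Bessel
+  // function of the first kind, order zero (equals 1 at x = 0, matching the data).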
+ Tensor<Scalar, 1> in_x(21);
+ Tensor<Scalar, 1> out(21);
+ Tensor<Scalar, 1> expected_out(21);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_x_array(21);
+ Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+ in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+ -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
+ 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
+ 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
+ 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
+ 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
+ 0.0897803118848;
+
+ for (int i = 0; i < 21; ++i) {
+ in_x(i) = in_x_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+ gpu_out.device(gpu_device) = gpu_in.bessel_i0e();
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 21; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_in);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_i1e()
+{
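+  // bessel_i1e(x) = exp(-|x|) * I1(x): the exponentially scaled modified Bessel
+  // function of the first kind, order one (odd in x, zero at x = 0).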
+ Tensor<Scalar, 1> in_x(21);
+ Tensor<Scalar, 1> out(21);
+ Tensor<Scalar, 1> expected_out(21);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_x_array(21);
+ Array<Scalar, 1, Dynamic> expected_out_array(21);
+
+ in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
+ -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;
+
+ expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
+ -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
+ -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
+ 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
+ 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
+ 0.0875062221833;
+
+ for (int i = 0; i < 21; ++i) {
+ in_x(i) = in_x_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_in;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_in), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);
+
+ gpu_out.device(gpu_device) = gpu_in.bessel_i1e();
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 21; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_in);
+ gpuFree(d_out);
}
+template <typename Scalar>
+void test_gpu_igamma_der_a()
+{
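+  // igamma_der_a(a, x) is the derivative of the regularized lower incomplete
+  // gamma function igamma(a, x) with respect to the parameter a.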
+ Tensor<Scalar, 1> in_x(30);
+ Tensor<Scalar, 1> in_a(30);
+ Tensor<Scalar, 1> out(30);
+ Tensor<Scalar, 1> expected_out(30);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_a_array(30);
+ Array<Scalar, 1, Dynamic> in_x_array(30);
+ Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+ // See special_functions.cpp for the Python code that generates the test data.
+
+ in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+ 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+ 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+ 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+ 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+ 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
+ -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
+ -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
+ -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
+ -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
+ -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
+ -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
+ -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+ for (int i = 0; i < 30; ++i) {
+ in_x(i) = in_x_array(i);
+ in_a(i) = in_a_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+ Scalar* d_a;
+ Scalar* d_x;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_a), bytes);
+ gpuMalloc((void**)(&d_x), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+ gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 30; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
-void test_cxx11_tensor_cuda()
+ gpuFree(d_a);
+ gpuFree(d_x);
+ gpuFree(d_out);
+}
+
+template <typename Scalar>
+void test_gpu_gamma_sample_der_alpha()
{
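+  // gamma_sample_der_alpha(alpha, sample): derivative of a Gamma(alpha, 1)
+  // sample with respect to alpha, i.e. the implicit reparameterization gradient.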
- CALL_SUBTEST_1(test_cuda_nullary());
- CALL_SUBTEST_1(test_cuda_elementwise_small());
- CALL_SUBTEST_1(test_cuda_elementwise());
- CALL_SUBTEST_1(test_cuda_props());
- CALL_SUBTEST_1(test_cuda_reduction());
- CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
- CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
- CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
- CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
- CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
-
-#if __cplusplus > 199711L
+ Tensor<Scalar, 1> in_alpha(30);
+ Tensor<Scalar, 1> in_sample(30);
+ Tensor<Scalar, 1> out(30);
+ Tensor<Scalar, 1> expected_out(30);
+ out.setZero();
+
+ Array<Scalar, 1, Dynamic> in_alpha_array(30);
+ Array<Scalar, 1, Dynamic> in_sample_array(30);
+ Array<Scalar, 1, Dynamic> expected_out_array(30);
+
+ // See special_functions.cpp for the Python code that generates the test data.
+
+ in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
+ 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
+ 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
+ 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
+ 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
+ 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
+ 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+ 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+ 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+ 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+ 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+ 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+ 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+ 1.00106492525, 0.97734200649, 1.02198794179;
+
+ for (int i = 0; i < 30; ++i) {
+ in_alpha(i) = in_alpha_array(i);
+ in_sample(i) = in_sample_array(i);
+ expected_out(i) = expected_out_array(i);
+ }
+
+ std::size_t bytes = in_alpha.size() * sizeof(Scalar);
+
+ Scalar* d_alpha;
+ Scalar* d_sample;
+ Scalar* d_out;
+ gpuMalloc((void**)(&d_alpha), bytes);
+ gpuMalloc((void**)(&d_sample), bytes);
+ gpuMalloc((void**)(&d_out), bytes);
+
+ gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
+ Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);
+
+ gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);
+
+ assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (int i = 0; i < 30; ++i) {
+ VERIFY_IS_APPROX(out(i), expected_out(i));
+ }
+
+ gpuFree(d_alpha);
+ gpuFree(d_sample);
+ gpuFree(d_out);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
+{
+ CALL_SUBTEST_1(test_gpu_nullary());
+ CALL_SUBTEST_1(test_gpu_elementwise_small());
+ CALL_SUBTEST_1(test_gpu_elementwise());
+ CALL_SUBTEST_1(test_gpu_props());
+ CALL_SUBTEST_1(test_gpu_reduction());
+ CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
+ CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
+ CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
+ CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// Disable these tests on HIP for now:
+// they hang; need to investigate and fix.
+ CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
+#endif
+
+#if EIGEN_GPU_TEST_C99_MATH
// std::erf, std::erfc, and so on were only added in C++11. We use them
// as a golden reference to validate the results produced by Eigen. Therefore
// we can only run these tests if we use a C++11 compiler.
- CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
- CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
-
- CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
- CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
- // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
- CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
- CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
-
- CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
- CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
- CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
- CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
-
- CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
- // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
- CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
- CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
- CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
-
- CALL_SUBTEST_5(test_cuda_digamma<float>());
- CALL_SUBTEST_5(test_cuda_digamma<double>());
-
- CALL_SUBTEST_5(test_cuda_polygamma<float>());
- CALL_SUBTEST_5(test_cuda_polygamma<double>());
-
- CALL_SUBTEST_5(test_cuda_zeta<float>());
- CALL_SUBTEST_5(test_cuda_zeta<double>());
-
- CALL_SUBTEST_5(test_cuda_igamma<float>());
- CALL_SUBTEST_5(test_cuda_igammac<float>());
-
- CALL_SUBTEST_5(test_cuda_igamma<double>());
- CALL_SUBTEST_5(test_cuda_igammac<double>());
-
- CALL_SUBTEST_6(test_cuda_betainc<float>());
- CALL_SUBTEST_6(test_cuda_betainc<double>());
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));
+
+ CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
+ // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
+ CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
+ CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));
+
+ CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
+ CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_erf<double>(0.001));
+
+ CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
+ // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
+ CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs
+ CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
+ CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+
+ CALL_SUBTEST_5(test_gpu_ndtri<float>());
+ CALL_SUBTEST_5(test_gpu_ndtri<double>());
+
+ CALL_SUBTEST_5(test_gpu_digamma<float>());
+ CALL_SUBTEST_5(test_gpu_digamma<double>());
+
+ CALL_SUBTEST_5(test_gpu_polygamma<float>());
+ CALL_SUBTEST_5(test_gpu_polygamma<double>());
+
+ CALL_SUBTEST_5(test_gpu_zeta<float>());
+ CALL_SUBTEST_5(test_gpu_zeta<double>());
+#endif
+
+ CALL_SUBTEST_5(test_gpu_igamma<float>());
+ CALL_SUBTEST_5(test_gpu_igammac<float>());
+
+ CALL_SUBTEST_5(test_gpu_igamma<double>());
+ CALL_SUBTEST_5(test_gpu_igammac<double>());
+
+#if !defined(EIGEN_USE_HIP)
+// disable these tests on HIP for now.
+ CALL_SUBTEST_6(test_gpu_betainc<float>());
+ CALL_SUBTEST_6(test_gpu_betainc<double>());
+
+ CALL_SUBTEST_6(test_gpu_i0e<float>());
+ CALL_SUBTEST_6(test_gpu_i0e<double>());
+
+ CALL_SUBTEST_6(test_gpu_i1e<float>());
+ CALL_SUBTEST_6(test_gpu_i1e<double>());
+
+ CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
+ CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());
+
+ CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
+ CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
+#endif
+
#endif
}
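
The cuda* -> gpu* renames above compile because the test goes through Eigen's
GPU/HIP define header (TensorGpuHipCudaDefines.h), which aliases the gpu*
names onto whichever runtime the build targets. A sketch of that mapping
along these lines (see the header itself for the authoritative list):

    #if defined(EIGEN_USE_HIP)
    // HIP build: the gpu* aliases resolve to the ROCm runtime.
    #define gpuMalloc hipMalloc
    #define gpuFree hipFree
    #define gpuMemcpy hipMemcpy
    #define gpuMemcpyAsync hipMemcpyAsync
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #define gpuStreamSynchronize hipStreamSynchronize
    #define gpuSuccess hipSuccess
    #else
    // CUDA build: the same names resolve to the CUDA runtime.
    #define gpuMalloc cudaMalloc
    #define gpuFree cudaFree
    #define gpuMemcpy cudaMemcpy
    #define gpuMemcpyAsync cudaMemcpyAsync
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #define gpuStreamSynchronize cudaStreamSynchronize
    #define gpuSuccess cudaSuccess
    #endif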
diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp
index 5fd88fa6c..c20edd9ac 100644
--- a/unsupported/test/cxx11_tensor_ifft.cpp
+++ b/unsupported/test/cxx11_tensor_ifft.cpp
@@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3)
}
}
-void test_cxx11_tensor_ifft() {
+EIGEN_DECLARE_TEST(cxx11_tensor_ifft) {
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
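
The void test_<name>() -> EIGEN_DECLARE_TEST(<name>) rename recurs throughout
this change. As a purely hypothetical illustration of the technique (not
Eigen's actual macro, which lives in test/main.h), a declare-test macro can
stamp out the entry point the harness expects via token pasting:

    // Hypothetical sketch only, for illustration.
    #define DECLARE_TEST_SKETCH(name) void test_##name()

    DECLARE_TEST_SKETCH(cxx11_tensor_ifft) {
      // expands to: void test_cxx11_tensor_ifft() { ... }
    }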
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
new file mode 100644
index 000000000..db1c0206e
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device)
+{
+ IndexType sizeDim1 = 245;
+ IndexType sizeDim2 = 343;
+ IndexType sizeDim3 = 577;
+
+ array<IndexType, 3> input_range ={{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}};
+
+ Tensor<DataType, 3,DataLayout, IndexType> tensor1(input_range);
+ Tensor<DataType, 3,DataLayout, IndexType> tensor2(input_range);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range);
+
+ typedef Eigen::DSizes<IndexType, 3> Index3;
+ Index3 strides1(1L,1L, 1L);
+ Index3 indicesStart1(1L, 0L, 0L);
+ Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3);
+
+ Index3 strides2(1L,1L, 1L);
+ Index3 indicesStart2(0L, 0L, 0L);
+ Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3);
+ Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1,sizeDim2,sizeDim3);
+
+ tensor1.setRandom();
+ tensor2.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType));
+  gpu3.device(sycl_device) = gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes);
+ sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+ tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2);
+
+  for (IndexType i = 0; i < slice_range[0]; ++i) {
+ for (IndexType j = 0; j < slice_range[1]; ++j) {
+ for (IndexType k = 0; k < slice_range[2]; ++k) {
+ VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+ CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
+#endif
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp
index 475c59651..862f1f7f0 100644
--- a/unsupported/test/cxx11_tensor_image_patch.cpp
+++ b/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -405,6 +405,57 @@ void test_patch_padding_same()
}
}
+// Verifies that SAME padding, when computed as negative values, will be clipped
+// to zero.
+void test_patch_padding_same_negative_padding_clip_to_zero() {
+ int input_depth = 1;
+ int input_rows = 15;
+ int input_cols = 1;
+ int input_batches = 1;
+ int ksize = 1; // Corresponds to the Rows and Cols for
+ // tensor.extract_image_patches<>.
+ int row_stride = 5;
+ int col_stride = 1;
+ // ColMajor
+ Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+ // Initializes tensor with incrementing numbers.
+ for (int i = 0; i < tensor.size(); ++i) {
+ tensor.data()[i] = i + 1;
+ }
+ Tensor<float, 5> result = tensor.extract_image_patches(
+ ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME);
+ // row padding will be computed as -2 originally and then be clipped to 0.
+ VERIFY_IS_EQUAL(result.coeff(0), 1.0f);
+ VERIFY_IS_EQUAL(result.coeff(1), 6.0f);
+ VERIFY_IS_EQUAL(result.coeff(2), 11.0f);
+
+ VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result.dimension(3), 3); // number of patches
+ VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches
+
+ // RowMajor
+ Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+ VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+ Tensor<float, 5, RowMajor> result_row_major =
+ tensor_row_major.extract_image_patches(ksize, ksize, row_stride,
+ col_stride, 1, 1, PADDING_SAME);
+ VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f);
+ VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f);
+ VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f);
+
+ VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+}
+
void test_patch_no_extra_dim()
{
Tensor<float, 3> tensor(2,3,5);
@@ -746,7 +797,7 @@ void test_imagenet_patches()
}
}
-void test_cxx11_tensor_image_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch)
{
CALL_SUBTEST_1(test_simple_patch());
CALL_SUBTEST_2(test_patch_no_extra_dim());
@@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch()
CALL_SUBTEST_4(test_patch_padding_valid_same_value());
CALL_SUBTEST_5(test_patch_padding_same());
CALL_SUBTEST_6(test_imagenet_patches());
+ CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero());
}
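
For reference, one way to arrive at the "-2 then clipped to 0" padding in the
new subtest, using the test's own numbers (15 rows, kernel size 1, row stride
5): SAME padding keeps ceil(input / stride) patches, so the implied per-side
padding can come out negative and must be clamped. A small self-contained
sketch of that arithmetic:

    #include <cstdio>

    int main() {
      const long input_rows = 15, ksize = 1, row_stride = 5;
      // SAME padding: keep ceil(input_rows / row_stride) patches per column.
      const long out_rows = (input_rows + row_stride - 1) / row_stride;           // 3
      long row_padding = ((out_rows - 1) * row_stride + ksize - input_rows) / 2;  // -2
      if (row_padding < 0) row_padding = 0;  // clipped to zero, as the test verifies
      std::printf("patches=%ld, row_padding=%ld\n", out_rows, row_padding);
      return 0;
    }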
diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
new file mode 100644
index 000000000..c1828a0ec
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp
@@ -0,0 +1,1092 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+ tensor_col_major.setRandom();
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ // Single pixel patch: ColMajor
+ array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+ gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7);
+
+ // Single pixel patch: RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2);
+
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ // ColMajor
+ if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+ std::cout << "Mismatch detected at index colmajor " << i << " : "
+ << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i]
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+ // RowMajor
+ if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+ std::cout << "Mismatch detected at index row major" << i << " : "
+ << tensor_row_major.data()[i] << " vs "
+ << single_patch_row_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+ tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+ single_patch_row_major.data()[i]);
+ }
+
+
+ // Entire image patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7);
+
+ // Entire image patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType c = 0; c < 5; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ for (IndexType b = 0; b < 7; ++b) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+ expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b);
+ expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+ }
+ // ColMajor
+ if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+ expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j
+ << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+ expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // 2D patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+ gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7);
+
+ // 2D patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+ gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+ IndexType stride = 1;
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 2; ++r) {
+ for (IndexType c = 0; c < 2; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ for (IndexType b = 0; b < 7; ++b) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r*stride + i - row_padding;
+ IndexType col_offset = c*stride + j - col_padding;
+ // ColMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ }
+ if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major);
+
+ // RowMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+ }
+ if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_patch_row_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+ sycl_device.deallocate(gpu_data_twod_patch_col_major);
+ sycl_device.deallocate(gpu_data_twod_patch_row_major);
+
+}
+
+
+// Verifies VALID padding (no padding) with incrementing values.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 3;
+ IndexType input_rows = 3;
+ IndexType input_cols = 3;
+ IndexType input_batches = 1;
+ IndexType ksize = 2;  // Patch rows and cols for tensor.extract_image_patches().
+ IndexType stride = 2;  // The same stride is used for rows and cols.
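+ // (With VALID padding, each dimension yields (in - ksize) / stride + 1
+ // patches; here (3 - 2) / 2 + 1 = 1 patch per dimension, which matches the
+ // dimension checks below.)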
+
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ // Initializes tensor with incrementing numbers before it is uploaded, so the
+ // device buffer holds well-defined values.
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ tensor_col_major.data()[i] = i + 1;
+ }
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ // ColMajor
+ array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}};
+ Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+ gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
+
+ // RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // No padding is carried out.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+
+ for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
+ IndexType patchId = i+input_rows*j;
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r + i - row_padding;
+ IndexType col_offset = c + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_result_col_major);
+ sycl_device.deallocate(gpu_data_result_row_major);
+}
+
+// Verifies VALID padding (no padding) with the same value.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 1;
+ IndexType input_rows = 5;
+ IndexType input_cols = 5;
+ IndexType input_batches = 2;
+ IndexType ksize = 3;  // Patch rows and cols for tensor.extract_image_patches().
+ IndexType stride = 2;  // The same stride is used for rows and cols.
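+ // (With VALID padding, each dimension yields (in - ksize) / stride + 1
+ // patches; here (5 - 3) / 2 + 1 = 2 per dimension, i.e. 4 patches in total,
+ // matching the dimension checks below.)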
+ // ColMajor
+
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+ gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f);
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}};
+ Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+ gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
+
+ // RowMajor
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // No padding is carried out.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+
+ for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
+ IndexType patchId = i+input_rows*j;
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r + i - row_padding;
+ IndexType col_offset = c + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+// Verifies SAME padding.
+template <typename DataType, typename IndexType>
+static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){
+ IndexType input_depth = 3;
+ IndexType input_rows = 4;
+ IndexType input_cols = 2;
+ IndexType input_batches = 1;
+ IndexType ksize = 2;  // Patch rows and cols for tensor.extract_image_patches().
+ IndexType stride = 2;  // The same stride is used for rows and cols.
+
+ // ColMajor
+ array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}};
+ array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ // Initializes tensor with incrementing numbers before it is uploaded, so the
+ // device buffer holds well-defined values.
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ tensor_col_major.data()[i] = i + 1;
+ }
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0));
+
+ array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}};
+ Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange);
+ // The in_row/in_col strides are passed explicitly as 1 so that PADDING_SAME
+ // binds to the padding argument instead of being consumed as an input stride.
+ gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_SAME);
+ sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches
+
+ // RowMajor
+
+ array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }};
+ Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =result_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange);
+ gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_SAME);
+ sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4));
+ VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3));
+ VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2));
+ VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1));
+ VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0));
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
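+ // (SAME padding arithmetic: out = ceil(in / stride) and total padding =
+ // (out - 1) * stride + ksize - in; rows: out = ceil(4/2) = 2, padding =
+ // (2-1)*2 + 2 - 4 = 0; cols: out = ceil(2/2) = 1, padding = (1-1)*2 + 2 - 2 = 0.)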
+
+ // Iterate over all patch anchors (padding is 0, so every anchor is in bounds);
+ // patchId follows the extractor's row-fastest patch ordering.
+ for (IndexType i = 0; (i+ksize-1) < input_rows; i += stride) { // input rows
+ for (IndexType j = 0; (j+ksize-1) < input_cols; j += stride) { // input cols
+ IndexType patchId = i/stride + (input_rows/stride)*(j/stride);
+ for (IndexType r = 0; r < ksize; ++r) { // patch rows
+ for (IndexType c = 0; c < ksize; ++c) { // patch cols
+ for (IndexType d = 0; d < input_depth; ++d) { // depth
+ for (IndexType b = 0; b < input_batches; ++b) { // batch
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r + i - row_padding;
+ IndexType col_offset = c + j - col_padding;
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset, b);
+ expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+ }
+ // ColMajor
+ if (result_col_major(d, r, c, patchId, b) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major);
+ // RowMajor
+ if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+template <typename DataType, typename IndexType>
+static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+
+ // ColMajor
+ array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange);
+ tensor_col_major.setRandom();
+ Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange);
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1));
+ VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0));
+
+
+ // Single pixel patch: ColMajor
+ array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange);
+ gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3);
+
+ // Single pixel patch: RowMajor
+ array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1);
+ sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1);
+
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ // ColMajor
+ if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) {
+ std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]);
+ // RowMajor
+ if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) {
+ std::cout << "Mismatch detected at index " << i << " : "
+ << tensor_col_major.data()[i] << " vs "
+ << single_patch_row_major.data()[i] << std::endl;
+ }
+ VERIFY_IS_EQUAL(single_patch_row_major.data()[i],
+ tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ VERIFY_IS_EQUAL(single_patch_col_major.data()[i],
+ single_patch_row_major.data()[i]);
+ }
+
+ // Entire image patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5);
+
+ // Entire image patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5);
+ sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType c = 0; c < 5; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+ expected_col_major = tensor_col_major(d, r-1+i, c-2+j);
+ expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+ }
+ // ColMajor
+ if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major);
+ // RowMajor
+ if (entire_image_patch_row_major(patchId, c, r, d) !=
+ expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+ expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+
+ // 2D patch: ColMajor
+ patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}};
+ Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange);
+ patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange);
+ gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5);
+
+ // 2D patch: RowMajor
+ patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}};
+ Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange);
+ gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2);
+ sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+ // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+ IndexType row_padding = 0;
+ IndexType col_padding = 0;
+ IndexType stride = 1;
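+ // (As above: with stride 1 the leading SAME padding is (ksize - 1) / 2 = 0
+ // for both rows and cols.)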
+
+ for (IndexType i = 0; i < 3; ++i) {
+ for (IndexType j = 0; j < 5; ++j) {
+ IndexType patchId = i+3*j;
+ for (IndexType r = 0; r < 2; ++r) {
+ for (IndexType c = 0; c < 2; ++c) {
+ for (IndexType d = 0; d < 2; ++d) {
+ DataType expected_col_major = 0.0f;
+ DataType expected_row_major = 0.0f;
+ IndexType row_offset = r*stride + i - row_padding;
+ IndexType col_offset = c*stride + j - col_padding;
+ // ColMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) {
+ expected_col_major = tensor_col_major(d, row_offset, col_offset);
+ }
+ if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major);
+ // RowMajor
+ if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+ expected_row_major = tensor_row_major(col_offset, row_offset, d);
+ }
+ if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+ }
+ VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+ // Check that ColMajor and RowMajor agree.
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major);
+ }
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_patch_row_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_image_patch_row_major);
+ sycl_device.deallocate(gpu_data_twod_patch_col_major);
+ sycl_device.deallocate(gpu_data_twod_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ // Test the code on typical configurations used by the 'imagenet' benchmarks at
+ // https://github.com/soumith/convnet-benchmarks
+ // ColMajor
+ IndexType sizeDim1 = 3;
+ IndexType sizeDim2 = 128;
+ IndexType sizeDim3 = 128;
+ IndexType sizeDim4 = 16;
+ array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange);
+ l_in_col_major.setRandom();
+
+ DataType* gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+
+ array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange);
+ size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange);
+ gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4);
+
+ // RowMajor
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}};
+ Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1);
+
+ for (IndexType b = 0; b < 16; ++b) {
+ for (IndexType i = 0; i < 128; ++i) {
+ for (IndexType j = 0; j < 128; ++j) {
+ IndexType patchId = i+128*j;
+ for (IndexType c = 0; c < 11; ++c) {
+ for (IndexType r = 0; r < 11; ++r) {
+ for (IndexType d = 0; d < 3; ++d) {
+ DataType expected = 0.0f;
+ if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
+ expected = l_in_col_major(d, r-5+i, c-5+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) !=
+ expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j
+ << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+ << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+ expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 16;
+ sizeDim2 = 64;
+ sizeDim3 = 64;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+ // RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9, sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 64; ++i) {
+ for (IndexType j = 0; j < 64; ++j) {
+ IndexType patchId = i+64*j;
+ for (IndexType c = 0; c < 9; ++c) {
+ for (IndexType r = 0; r < 9; ++r) {
+ for (IndexType d = 0; d < 16; ++d) {
+ DataType expected = 0.0f;
+ if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
+ expected = l_in_col_major(d, r-4+i, c-4+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 32;
+ sizeDim2 = 16;
+ sizeDim3 = 16;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+ // RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7, sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 16; ++i) {
+ for (IndexType j = 0; j < 16; ++j) {
+ IndexType patchId = i+16*j;
+ for (IndexType c = 0; c < 7; ++c) {
+ for (IndexType r = 0; r < 7; ++r) {
+ for (IndexType d = 0; d < 32; ++d) {
+ DataType expected = 0.0f;
+ if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
+ expected = l_in_col_major(d, r-3+i, c-3+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // ColMajor
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sizeDim1 = 64;
+ sizeDim2 = 13;
+ sizeDim3 = 13;
+ sizeDim4 = 32;
+ tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ l_in_col_major.resize(tensorColMajorRange);
+ l_in_col_major.setRandom();
+ gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange);
+
+ patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}};
+ l_out_col_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType);
+ gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType));
+ gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3);
+ sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13);
+ VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32);
+
+ // RowMajor
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+ patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3, sizeDim1}};
+ l_out_row_major.resize(patchTensorRange);
+ patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType);
+ gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange);
+ gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3);
+ sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+ VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+ for (IndexType b = 0; b < 32; ++b) {
+ for (IndexType i = 0; i < 13; ++i) {
+ for (IndexType j = 0; j < 13; ++j) {
+ IndexType patchId = i+13*j;
+ for (IndexType c = 0; c < 3; ++c) {
+ for (IndexType r = 0; r < 3; ++r) {
+ for (IndexType d = 0; d < 64; ++d) {
+ DataType expected = 0.0f;
+ if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
+ expected = l_in_col_major(d, r-1+i, c-1+j, b);
+ }
+ // ColMajor
+ if (l_out_col_major(d, r, c, patchId, b) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected);
+ // RowMajor
+ if (l_out_row_major(b, patchId, c, r, d) != expected) {
+ std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+ }
+ VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_l_in_col_major);
+ sycl_device.deallocate(gpu_data_l_out_col_major);
+ sycl_device.deallocate(gpu_data_l_out_row_major);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_image_patch_sycl<DataType, int64_t>(sycl_device);
+ test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device);
+ test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device);
+ test_patch_padding_same_sycl<DataType, int64_t>(sycl_device);
+ test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device);
+ test_imagenet_patches_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp
index 4cf5df666..2166532c8 100644
--- a/unsupported/test/cxx11_tensor_index_list.cpp
+++ b/unsupported/test/cxx11_tensor_index_list.cpp
@@ -22,9 +22,9 @@ static void test_static_index_list()
VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -167,19 +167,18 @@ static void test_type2indexpair_list()
typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;
typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;
- typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;
- typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;
+ typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b;
+ typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c;
- Dims0 d0;
Dims2_a d2_a;
Dims2_b d2_b;
- d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));
+ d2_b.set(1, Eigen::IndexPair<Index>(1,11));
Dims2_c d2_c;
- d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
- d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11)); // setting type2indexpair to correct value.
- d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+ d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10)));
+ d2_c.set(1, Eigen::IndexPair<Index>(1,11)); // setting type2indexpair to correct value.
+ d2_c.set(2, Eigen::IndexPair<Index>(2,12));
VERIFY_IS_EQUAL(d2_a[0].first, 0);
VERIFY_IS_EQUAL(d2_a[0].second, 10);
@@ -278,9 +277,9 @@ static void test_dynamic_index_list()
VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0);
Tensor<float, 1> result = tensor.sum(reduction_axis);
for (int i = 0; i < result.size(); ++i) {
@@ -310,10 +309,10 @@ static void test_mixed_index_list()
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
- VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2);
+ VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3);
typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
ReductionIndices reduction_indices;
@@ -373,7 +372,7 @@ static void test_dim_check()
#endif
-void test_cxx11_tensor_index_list()
+EIGEN_DECLARE_TEST(cxx11_tensor_index_list)
{
#ifdef EIGEN_HAS_INDEX_LIST
CALL_SUBTEST(test_static_index_list());
diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp
index 4997935e9..75089e856 100644
--- a/unsupported/test/cxx11_tensor_inflation.cpp
+++ b/unsupported/test/cxx11_tensor_inflation.cpp
@@ -74,7 +74,7 @@ static void test_simple_inflation()
}
}
-void test_cxx11_tensor_inflation()
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation)
{
CALL_SUBTEST(test_simple_inflation<ColMajor>());
CALL_SUBTEST(test_simple_inflation<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
new file mode 100644
index 000000000..521ae0cc3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation: for each dimension, the inflated size is
+// ((dim - 1) * stride[dim] + 1).
+//
+// For example, a 1-dimensional vector of size 3 with values (4, 4, 4) and an
+// inflation stride of 3 becomes a tensor of size (3 - 1) * 3 + 1 = 7 with the
+// values (4, 0, 0, 4, 0, 0, 4).
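+//
+// A minimal sketch of the same rule (a hypothetical helper shown only for
+// illustration; the tests below use Tensor::inflate directly):
+//
+//   template <typename IndexType>
+//   constexpr IndexType inflated_size(IndexType dim, IndexType stride) {
+//     return (dim - 1) * stride + 1;  // e.g. inflated_size(3, 3) == 7
+//   }
+//
+// Element i of the input lands at output index i * stride; every other
+// output element is zero.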
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensorRange);
+ tensor.setRandom();
+
+ array<IndexType, 4> strides;
+ strides[0] = 1;
+ strides[1] = 1;
+ strides[2] = 1;
+ strides[3] = 1;
+
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_no_stride.device(sycl_device)=gpu_tensor.inflate(strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+ VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+
+ IndexType inflatedSizeDim1 = 3;
+ IndexType inflatedSizeDim2 = 9;
+ IndexType inflatedSizeDim3 = 9;
+ IndexType inflatedSizeDim4 = 19;
+ array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+ const size_t inflatedTensorBuffSize =inflated.size()*sizeof(DataType);
+ DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+ gpu_inflated.device(sycl_device)=gpu_tensor.inflate(strides);
+ sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+ VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+ VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+ VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+ VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+ for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+ for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+ for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+ for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+ if (i % strides[0] == 0 &&
+ j % strides[1] == 0 &&
+ k % strides[2] == 0 &&
+ l % strides[3] == 0) {
+ VERIFY_IS_EQUAL(inflated(i,j,k,l),
+ tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3]));
+ } else {
+ VERIFY_IS_EQUAL(0, inflated(i,j,k,l));
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_no_stride);
+ sycl_device.deallocate(gpu_data_inflated);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp
index 8e2b70b75..d18a05ec4 100644
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -135,7 +135,7 @@ void test_specific() {
VERIFY_IS_EQUAL(result, result_op);
}
-void test_cxx11_tensor_intdiv()
+EIGEN_DECLARE_TEST(cxx11_tensor_intdiv)
{
CALL_SUBTEST_1(test_signed_32bit());
CALL_SUBTEST_2(test_unsigned_32bit());
diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp
index 489960529..2c638f9bf 100644
--- a/unsupported/test/cxx11_tensor_io.cpp
+++ b/unsupported/test/cxx11_tensor_io.cpp
@@ -119,7 +119,7 @@ static void test_output_const()
}
-void test_cxx11_tensor_io()
+EIGEN_DECLARE_TEST(cxx11_tensor_io)
{
CALL_SUBTEST(test_output_0d<ColMajor>());
CALL_SUBTEST(test_output_0d<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp
index ae297a9da..efb333360 100644
--- a/unsupported/test/cxx11_tensor_layout_swap.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -54,7 +54,7 @@ static void test_swap_as_lvalue()
}
-void test_cxx11_tensor_layout_swap()
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap)
{
CALL_SUBTEST(test_simple_swap());
CALL_SUBTEST(test_swap_as_lvalue());
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
new file mode 100644
index 000000000..9546b911c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -0,0 +1,126 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, typename IndexType>
+static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 7;
+ array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+
+ Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+ Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+ tensor1.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
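+  // swap_layout() reverses the dimension order and flips the storage order;
+  // the underlying data is reinterpreted in place, not shuffled.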
+ gpu2.device(sycl_device)=gpu1.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+ VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+ VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, typename IndexType>
+static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 7;
+ array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}};
+
+ Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange);
+ Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange);
+ tensor1.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange);
+ TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+ gpu2.swap_layout().device(sycl_device)=gpu1;
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2));
+ VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1));
+ VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 7; ++k) {
+ VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_swap_sycl<DataType, int64_t>(sycl_device);
+ test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp
index 071f5b406..6ba9a212d 100644
--- a/unsupported/test/cxx11_tensor_lvalue.cpp
+++ b/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -36,7 +36,7 @@ static void test_compound_assignment()
}
-void test_cxx11_tensor_lvalue()
+EIGEN_DECLARE_TEST(cxx11_tensor_lvalue)
{
CALL_SUBTEST(test_compound_assignment());
}
diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp
index 3db0ee7c0..4d4f68911 100644
--- a/unsupported/test/cxx11_tensor_map.cpp
+++ b/unsupported/test/cxx11_tensor_map.cpp
@@ -19,8 +19,8 @@ static void test_0d()
Tensor<int, 0> scalar1;
Tensor<int, 0, RowMajor> scalar2;
- TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());
- TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
+ TensorMap<const Tensor<int, 0> > scalar3(scalar1.data());
+ TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
scalar1() = 7;
scalar2() = 13;
@@ -37,8 +37,8 @@ static void test_1d()
Tensor<int, 1> vec1(6);
Tensor<int, 1, RowMajor> vec2(6);
- TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);
- TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
+ TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6);
+ TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6);
vec1(0) = 4; vec2(0) = 0;
vec1(1) = 8; vec2(1) = 1;
@@ -85,8 +85,8 @@ static void test_2d()
mat2(1,1) = 4;
mat2(1,2) = 5;
- TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);
- TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+ TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3);
+ TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
VERIFY_IS_EQUAL(mat3.rank(), 2);
VERIFY_IS_EQUAL(mat3.size(), 6);
@@ -129,8 +129,8 @@ static void test_3d()
}
}
- TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);
- TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+ TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7);
+ TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
VERIFY_IS_EQUAL(mat3.rank(), 3);
VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
@@ -265,7 +265,54 @@ static void test_casting()
VERIFY_IS_EQUAL(sum1, 861);
}
-void test_cxx11_tensor_map()
+template<typename T>
+static const T& add_const(T& value) {
+ return value;
+}
+
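+// add_const() returns a const reference, so data() called through it yields a
+// const pointer; this exercises the TensorMap constructors taking const data.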
+static void test_0d_const_tensor()
+{
+ Tensor<int, 0> scalar1;
+ Tensor<int, 0, RowMajor> scalar2;
+
+ TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data());
+ TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data());
+
+ scalar1() = 7;
+ scalar2() = 13;
+
+ VERIFY_IS_EQUAL(scalar1.rank(), 0);
+ VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+ VERIFY_IS_EQUAL(scalar3(), 7);
+ VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_0d_const_tensor_map()
+{
+ Tensor<int, 0> scalar1;
+ Tensor<int, 0, RowMajor> scalar2;
+
+ const TensorMap<Tensor<int, 0> > scalar3(scalar1.data());
+ const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data());
+
+  // Although the TensorMap is const, we can still write through it to the
+  // underlying storage, because it maps a non-const Tensor.
+ scalar3() = 7;
+ scalar4() = 13;
+
+ VERIFY_IS_EQUAL(scalar1(), 7);
+ VERIFY_IS_EQUAL(scalar2(), 13);
+
+  // The pointer to the underlying storage is also non-const.
+ scalar3.data()[0] = 8;
+ scalar4.data()[0] = 14;
+
+ VERIFY_IS_EQUAL(scalar1(), 8);
+ VERIFY_IS_EQUAL(scalar2(), 14);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_map)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
@@ -274,4 +321,7 @@ void test_cxx11_tensor_map()
CALL_SUBTEST(test_from_tensor());
CALL_SUBTEST(test_casting());
+
+ CALL_SUBTEST(test_0d_const_tensor());
+ CALL_SUBTEST(test_0d_const_tensor_map());
}
diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp
index 61c742a16..82a1a26d8 100644
--- a/unsupported/test/cxx11_tensor_math.cpp
+++ b/unsupported/test/cxx11_tensor_math.cpp
@@ -39,7 +39,7 @@ static void test_sigmoid()
}
-void test_cxx11_tensor_math()
+EIGEN_DECLARE_TEST(cxx11_tensor_math)
{
CALL_SUBTEST(test_tanh());
CALL_SUBTEST(test_sigmoid());
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
new file mode 100644
index 000000000..029653e27
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+using Eigen::RowMajor;
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+ IndexType sizeDim1 = 4;
+ IndexType sizeDim2 = 4;
+ IndexType sizeDim3 = 1;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+ in = in.random();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+ gpu2.device(sycl_device) = gpu1.tanh();
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+ out_cpu=in.tanh();
+
+ for (int i = 0; i < in.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), out_cpu(i));
+ }
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device)
+{
+
+ IndexType sizeDim1 = 4;
+ IndexType sizeDim2 = 4;
+ IndexType sizeDim3 = 1;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange);
+
+ in = in.random();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType));
+ gpu2.device(sycl_device) = gpu1.sigmoid();
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType));
+
+ out_cpu=in.sigmoid();
+
+ for (int i = 0; i < in.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), out_cpu(i));
+ }
+}
+
+
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp
index 4fba6fdd1..ee2616fd7 100644
--- a/unsupported/test/cxx11_tensor_mixed_indices.cpp
+++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -47,7 +47,7 @@ static void test_simple()
}
-void test_cxx11_tensor_mixed_indices()
+EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices)
{
CALL_SUBTEST(test_simple());
}
diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp
index f7de43110..ed5d5ade3 100644
--- a/unsupported/test/cxx11_tensor_morphing.cpp
+++ b/unsupported/test/cxx11_tensor_morphing.cpp
@@ -41,7 +41,29 @@ static void test_simple_reshape()
}
}
-template<typename>
+template <typename>
+static void test_static_reshape() {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ using Eigen::type2index;
+
+ Tensor<float, 5> tensor(2, 3, 1, 7, 1);
+ tensor.setRandom();
+
+ // New dimensions: [2, 3, 7]
+ Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim;
+ Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim));
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 7; ++k) {
+ VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k));
+ }
+ }
+ }
+#endif
+}
+
+template <typename>
static void test_reshape_in_expr() {
MatrixXf m1(2,3*5*7*11);
MatrixXf m2(3*5*7*11,13);
@@ -90,19 +112,19 @@ static void test_reshape_as_lvalue()
}
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_simple_slice()
{
- Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+ Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
tensor.setRandom();
- Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+ Tensor<T, 5, DataLayout> slice1(1,1,1,1,1);
Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
slice1 = tensor.slice(indices, sizes);
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
- Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+ Tensor<T, 5, DataLayout> slice2(1,1,2,2,3);
Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
slice2 = tensor.slice(indices2, sizes2);
@@ -115,20 +137,20 @@ static void test_simple_slice()
}
}
-template<typename=void>
+template<typename T>
static void test_const_slice()
{
- const float b[1] = {42};
- TensorMap<Tensor<const float, 1> > m(b, 1);
+ const T b[1] = {42};
+ TensorMap<Tensor<const T, 1> > m(b, 1);
DSizes<DenseIndex, 1> offsets;
offsets[0] = 0;
- TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));
+ TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions()));
VERIFY_IS_EQUAL(slice_ref(0), 42);
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_slice_in_expr() {
- typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+ typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx;
Mtx m1(7,7);
Mtx m2(3,3);
m1.setRandom();
@@ -136,10 +158,10 @@ static void test_slice_in_expr() {
Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);
- TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
- TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
- Tensor<float, 2, DataLayout> tensor3(3,1);
- typedef Tensor<float, 1>::DimensionPair DimPair;
+ TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+ TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+ Tensor<T, 2, DataLayout> tensor3(3,1);
+ typedef typename Tensor<T, 1>::DimensionPair DimPair;
array<DimPair, 1> contract_along{{DimPair(1, 0)}};
Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);
@@ -156,28 +178,28 @@ static void test_slice_in_expr() {
}
// Take an arbitrary slice of an arbitrarily sized tensor.
- TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
- Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+ TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+ Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
for (int i = 0; i < 35; ++i) {
VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));
}
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_slice_as_lvalue()
{
- Tensor<float, 3, DataLayout> tensor1(2,2,7);
+ Tensor<T, 3, DataLayout> tensor1(2,2,7);
tensor1.setRandom();
- Tensor<float, 3, DataLayout> tensor2(2,2,7);
+ Tensor<T, 3, DataLayout> tensor2(2,2,7);
tensor2.setRandom();
- Tensor<float, 3, DataLayout> tensor3(4,3,5);
+ Tensor<T, 3, DataLayout> tensor3(4,3,5);
tensor3.setRandom();
- Tensor<float, 3, DataLayout> tensor4(4,3,2);
+ Tensor<T, 3, DataLayout> tensor4(4,3,2);
tensor4.setRandom();
- Tensor<float, 3, DataLayout> tensor5(10,13,12);
+ Tensor<T, 3, DataLayout> tensor5(10,13,12);
tensor5.setRandom();
- Tensor<float, 3, DataLayout> result(4,5,7);
+ Tensor<T, 3, DataLayout> result(4,5,7);
Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
result.slice(first_slice, sizes12) = tensor1;
@@ -223,10 +245,10 @@ static void test_slice_as_lvalue()
}
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_slice_raw_data()
{
- Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+ Tensor<T, 4, DataLayout> tensor(3,5,7,11);
tensor.setRandom();
Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
@@ -253,7 +275,7 @@ static void test_slice_raw_data()
extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);
auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
- VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));
+ VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0));
if (DataLayout == ColMajor) {
offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
@@ -318,15 +340,15 @@ static void test_slice_raw_data()
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_strided_slice()
{
- typedef Tensor<float, 5, DataLayout> Tensor5f;
+ typedef Tensor<T, 5, DataLayout> Tensor5f;
typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
- typedef Tensor<float, 2, DataLayout> Tensor2f;
+ typedef Tensor<T, 2, DataLayout> Tensor2f;
typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
- Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
- Tensor<float, 2, DataLayout> tensor2(7,11);
+ Tensor<T, 5, DataLayout> tensor(2,3,5,7,11);
+ Tensor<T, 2, DataLayout> tensor2(7,11);
tensor.setRandom();
tensor2.setRandom();
@@ -412,13 +434,13 @@ static void test_strided_slice()
}
}
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_strided_slice_write()
{
- typedef Tensor<float, 2, DataLayout> Tensor2f;
+ typedef Tensor<T, 2, DataLayout> Tensor2f;
typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
- Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+ Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11);
tensor.setRandom();
tensor2=tensor;
Tensor2f slice(2,3);
@@ -438,15 +460,14 @@ static void test_strided_slice_write()
}
}
-
-template<int DataLayout>
+template<typename T, int DataLayout>
static void test_composition()
{
- Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+ Eigen::Tensor<T, 2, DataLayout> matrix(7, 11);
matrix.setRandom();
const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
- Eigen::Tensor<float, 3, DataLayout> tensor =
+ Eigen::Tensor<T, 3, DataLayout> tensor =
matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
@@ -458,28 +479,87 @@ static void test_composition()
}
}
+template<typename T, int DataLayout>
+static void test_empty_slice()
+{
+ Tensor<T, 3, DataLayout> tensor(2,3,5);
+ tensor.setRandom();
+ Tensor<T, 3, DataLayout> copy = tensor;
+
+ // empty size in first dimension
+ Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2);
+ Tensor<T, 3, DataLayout> slice1(0,1,2);
+ slice1.setRandom();
+ tensor.slice(indices1, sizes1) = slice1;
+
+ // empty size in second dimension
+ Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2);
+ Tensor<T, 3, DataLayout> slice2(1,0,2);
+ slice2.setRandom();
+ tensor.slice(indices2, sizes2) = slice2;
+
+ // empty size in third dimension
+ Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0);
+ Tensor<T, 3, DataLayout> slice3(1,1,0);
+ slice3.setRandom();
+ tensor.slice(indices3, sizes3) = slice3;
+
+ // empty size in first and second dimension
+ Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2);
+ Tensor<T, 3, DataLayout> slice4(0,0,2);
+ slice4.setRandom();
+ tensor.slice(indices4, sizes4) = slice4;
+
+ // empty size in second and third dimension
+ Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0);
+ Tensor<T, 3, DataLayout> slice5(1,0,0);
+ slice5.setRandom();
+ tensor.slice(indices5, sizes5) = slice5;
+
+ // empty size in all dimensions
+ Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3);
+ Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0);
+ Tensor<T, 3, DataLayout> slice6(0,0,0);
+ slice6.setRandom();
+ tensor.slice(indices6, sizes6) = slice6;
+
+ // none of these operations should change the tensor's components
+ // because all of the rvalue slices have at least one zero dimension
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k));
+ }
+ }
+ }
+}
+
+#define CALL_SUBTEST_PART(PART) \
+ CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME) \
+ CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \
+ CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>()))
-void test_cxx11_tensor_morphing()
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing)
{
CALL_SUBTEST_1(test_simple_reshape<void>());
- CALL_SUBTEST_1(test_reshape_in_expr<void>());
+ CALL_SUBTEST_1(test_static_reshape<void>());
CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
-
- CALL_SUBTEST_1(test_simple_slice<ColMajor>());
- CALL_SUBTEST_1(test_simple_slice<RowMajor>());
- CALL_SUBTEST_1(test_const_slice());
- CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
- CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
- CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
- CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
- CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
- CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
-
- CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
- CALL_SUBTEST_6(test_strided_slice<ColMajor>());
- CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
- CALL_SUBTEST_6(test_strided_slice<RowMajor>());
-
- CALL_SUBTEST_7(test_composition<ColMajor>());
- CALL_SUBTEST_7(test_composition<RowMajor>());
+ CALL_SUBTEST_1(test_reshape_in_expr<void>());
+ CALL_SUBTEST_1(test_const_slice<float>());
+
+ CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice);
+ CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue);
+ CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data);
+ CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write);
+ CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice);
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition);
+
+  // test_slice_in_expr relies on expf and contraction, so it is float-only.
+  CALL_SUBTEST_2(test_slice_in_expr<float, ColMajor>());
+  CALL_SUBTEST_3(test_slice_in_expr<float, RowMajor>());
+  CALL_SUBTESTS_TYPES_LAYOUTS(7, test_empty_slice);
}
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
new file mode 100644
index 000000000..bf001b40f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
+{
+ typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
+ typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
+ typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
+ typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
+
+ Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
+
+ tensor1.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
+ DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
+ TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
+ TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
+ TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
+
+ gpu2.device(sycl_device)=gpu1.reshape(dim2);
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType));
+
+ gpu3.device(sycl_device)=gpu1.reshape(dim3);
+ sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
+
+ gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
+ sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
+ for (IndexType i = 0; i < 2; ++i){
+ for (IndexType j = 0; j < 3; ++j){
+ for (IndexType k = 0; k < 7; ++k){
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); // holds for both layouts
+ if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor
+ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); ///ColMajor
+ }
+ else{
+ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k)); /// RowMajor
+ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k)); /// RowMajor
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+ sycl_device.deallocate(gpu_data4);
+}
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+ typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
+ typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
+ typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
+ Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
+ Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
+ Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
+
+ tensor.setRandom();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
+
+ TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
+ TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
+ TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+
+ gpu2.reshape(dim1).device(sycl_device)=gpu1;
+ sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType));
+
+ gpu3.reshape(dim1).device(sycl_device)=gpu1;
+ sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
+
+
+ for (IndexType i = 0; i < 2; ++i){
+ for (IndexType j = 0; j < 3; ++j){
+ for (IndexType k = 0; k < 7; ++k){
+ VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+ if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
+ VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor
+ }
+ else{
+ VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k)); /// RowMajor
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
+ tensor.setRandom();
+ array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+ Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+ gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+ sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+ VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+
+ array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+ Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+ gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+ sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 2; ++j) {
+ for (IndexType k = 0; k < 3; ++k) {
+ VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+}
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ IndexType sizeDim5 = 11;
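+  // With unit strides, stridedSlice(start, stop, strides) must select exactly
+  // the same elements as slice(start, stop - start); both paths are compared
+  // against the source tensor below.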
+ typedef Eigen::DSizes<IndexType, 5> Index5;
+ Index5 strides(1L,1L,1L,1L,1L);
+ Index5 indicesStart(1L,2L,3L,4L,5L);
+ Index5 indicesStop(2L,3L,4L,5L,6L);
+
+ array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
+ Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange);
+ tensor.setRandom();
+
+ array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
+ Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range);
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
+ DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range);
+
+ Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+ gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
+ sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
+
+ gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides);
+ sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType));
+
+ VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+ VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+ array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
+ Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
+ Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range);
+
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
+ DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
+ TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range);
+ Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
+ Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
+ Index5 strides2(1L,1L,1L,1L,1L);
+ Index5 indicesStart2(1L,1L,3L,4L,5L);
+ Index5 indicesStop2(2L,2L,5L,6L,8L);
+
+ gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
+ sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
+
+ gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2);
+ sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 2; ++j) {
+ for (IndexType k = 0; k < 3; ++k) {
+ VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+ VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
+ typedef Eigen::DSizes<IndexType, 2> Index2;
+ IndexType sizeDim1 = 7L;
+ IndexType sizeDim2 = 11L;
+ array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+ Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
+ IndexType sliceDim1 = 2;
+ IndexType sliceDim2 = 3;
+ array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
+ Tensor2f slice(sliceRange);
+ Index2 strides(1L,1L);
+ Index2 indicesStart(3L,4L);
+ Index2 indicesStop(5L,7L);
+ Index2 lengths(2L,3L);
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
+ TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange);
+
+
+ tensor.setRandom();
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+ gpu2.device(sycl_device)=gpu1;
+
+ slice.setRandom();
+ sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
+
+
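+  // slice(start, lengths) and stridedSlice(start, stop, unit strides) address
+  // the same region, so after these writes tensor and tensor2 must match.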
+ gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
+ gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
+ sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+ for(IndexType i=0;i<sizeDim1;i++)
+ for(IndexType j=0;j<sizeDim2;j++){
+ VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+ sycl_device.deallocate(gpu_data3);
+}
+
+template <typename OutIndex, typename DSizes>
+Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) {
+ Eigen::array<OutIndex, DSizes::count> out;
+ for (int i = 0; i < DSizes::count; ++i) {
+ out[i] = in[i];
+ }
+ return out;
+}
+
+template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType>
+int run_eigen(const SyclDevice& sycl_device) {
+ using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>;
+ using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>;
+ using TensorMI64 = TensorMap<TensorI64>;
+ using TensorMI32 = TensorMap<TensorI32>;
+ Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}};
+ Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}};
+
+ TensorI64 out_tensor_gpu(tensor_range);
+ TensorI64 out_tensor_cpu(tensor_range);
+ out_tensor_cpu.setRandom();
+
+ TensorI64 sub_tensor(slice_range);
+ sub_tensor.setRandom();
+
+ DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType)));
+ DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType)));
+ TensorMI64 out_gpu(out_gpu_data, tensor_range);
+ TensorMI64 sub_gpu(sub_gpu_data, slice_range);
+
+ sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType));
+
+ Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}};
+ Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}};
+ TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions()));
+ TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions()));
+ TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions()));
+ TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions()));
+
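+  // Run the same 32-bit-indexed slice assignment on the device and on the
+  // host copy, then compare the two results element by element.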
+ out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32;
+
+ out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32;
+
+ sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType));
+ int has_err = 0;
+ for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) {
+ auto exp = out_tensor_cpu(i);
+ auto val = out_tensor_gpu(i);
+ if (val != exp) {
+ std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl;
+ has_err = 1;
+ }
+ }
+ sycl_device.deallocate(out_gpu_data);
+ sycl_device.deallocate(sub_gpu_data);
+ return has_err;
+}
+
+template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
+ test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
+ test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
+ test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+ test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  VERIFY(run_eigen<float, RowMajor, long, int>(sycl_device) == 0);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp
new file mode 100644
index 000000000..a2982319f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_move.cpp
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <utility>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void calc_indices(int i, int& x, int& y, int& z)
+{
+ x = i / 4;
+ y = (i % 4) / 2;
+ z = i % 2;
+}
+
+static void test_move()
+{
+ int x;
+ int y;
+ int z;
+
+ Tensor<int,3> tensor1(2, 2, 2);
+ Tensor<int,3,RowMajor> tensor2(2, 2, 2);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ tensor1(x,y,z) = i;
+ tensor2(x,y,z) = 2 * i;
+ }
+
+ // Invokes the move constructor.
+ Tensor<int,3> moved_tensor1 = std::move(tensor1);
+ Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2);
+
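+  // The moved-from tensors relinquish their storage and report size 0.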
+ VERIFY_IS_EQUAL(tensor1.size(), 0);
+ VERIFY_IS_EQUAL(tensor2.size(), 0);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i);
+ VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i);
+ }
+
+ Tensor<int,3> moved_tensor3(2,2,2);
+ Tensor<int,3,RowMajor> moved_tensor4(2,2,2);
+
+ moved_tensor3.setZero();
+ moved_tensor4.setZero();
+
+ // Invokes the move assignment operator.
+ moved_tensor3 = std::move(moved_tensor1);
+ moved_tensor4 = std::move(moved_tensor2);
+
+ for (int i = 0; i < 8; i++)
+ {
+ calc_indices(i, x, y, z);
+ VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i);
+ VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i);
+ }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_move)
+{
+ CALL_SUBTEST(test_move());
+}
diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp
index c946007b8..8e8165302 100644
--- a/unsupported/test/cxx11_tensor_notification.cpp
+++ b/unsupported/test/cxx11_tensor_notification.cpp
@@ -9,38 +9,21 @@
#define EIGEN_USE_THREADS
+#include <atomic>
+
#include <stdlib.h>
#include "main.h"
#include <Eigen/CXX11/Tensor>
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#include <windows.h>
-void sleep(int seconds) {
- Sleep(seconds*1000);
-}
-#else
-#include <unistd.h>
-#endif
-
-
-namespace {
-
-void WaitAndAdd(Eigen::Notification* n, int* counter) {
- n->Wait();
- *counter = *counter + 1;
-}
-
-} // namespace
-
static void test_notification_single()
{
ThreadPool thread_pool(1);
- int counter = 0;
+ std::atomic<int> counter(0);
Eigen::Notification n;
- std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter]() { n.Wait(); ++counter; };
thread_pool.Schedule(func);
- sleep(1);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
@@ -48,7 +31,7 @@ static void test_notification_single()
// Unblock the thread
n.Notify();
- sleep(1);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
@@ -60,21 +43,21 @@ static void test_notification_multiple()
{
ThreadPool thread_pool(1);
- int counter = 0;
+ std::atomic<int> counter(0);
Eigen::Notification n;
- std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  auto func = [&n, &counter]() { n.Wait(); ++counter; };
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
- sleep(1);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
- sleep(1);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
VERIFY_IS_EQUAL(counter, 4);
}
-void test_cxx11_tensor_notification()
+EIGEN_DECLARE_TEST(cxx11_tensor_notification)
{
CALL_SUBTEST(test_notification_single());
CALL_SUBTEST(test_notification_multiple());
diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp
index e9d1b2d3c..99e18076a 100644
--- a/unsupported/test/cxx11_tensor_of_complex.cpp
+++ b/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -94,7 +94,7 @@ static void test_contractions()
}
-void test_cxx11_tensor_of_complex()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_complex)
{
CALL_SUBTEST(test_additions());
CALL_SUBTEST(test_abs());
diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp
index f179a0c21..344d678ef 100644
--- a/unsupported/test/cxx11_tensor_of_const_values.cpp
+++ b/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -97,7 +97,7 @@ static void test_plus_equal()
}
-void test_cxx11_tensor_of_const_values()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values)
{
CALL_SUBTEST(test_assign());
CALL_SUBTEST(test_plus());
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 2f86980a2..30bcc1d28 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -9,21 +9,19 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+
using Eigen::Tensor;
template<typename>
-void test_cuda_numext() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_numext() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -59,14 +57,14 @@ void test_cuda_numext() {
}
-#ifdef EIGEN_HAS_CUDA_FP16
+#ifdef EIGEN_HAS_GPU_FP16
template<typename>
-void test_cuda_conversion() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_conversion() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
-
+
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
@@ -97,8 +95,8 @@ void test_cuda_conversion() {
}
template<typename>
-void test_cuda_unary() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_unary() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -134,8 +132,8 @@ void test_cuda_unary() {
}
template<typename>
-void test_cuda_elementwise() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_elementwise() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -176,8 +174,8 @@ void test_cuda_elementwise() {
}
template<typename>
-void test_cuda_trancendental() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_trancendental() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -200,6 +198,8 @@ void test_cuda_trancendental() {
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+ Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
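+  // Note: gpu_res4_half and gpu_res4_float alias the d_res3_* buffers, so the
+  // expm1 results share storage with the log1p results.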
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
@@ -207,6 +207,7 @@ void test_cuda_trancendental() {
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+ gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
@@ -217,6 +218,9 @@ void test_cuda_trancendental() {
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+ gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+ gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
+
Tensor<float, 1> input1(num_elem);
Tensor<Eigen::half, 1> half_prec1(num_elem);
Tensor<Eigen::half, 1> full_prec1(num_elem);
@@ -243,7 +247,7 @@ void test_cuda_trancendental() {
}
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
- if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+ if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
else
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
@@ -264,8 +268,8 @@ void test_cuda_trancendental() {
}
template<typename>
-void test_cuda_contractions() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_contractions() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int rows = 23;
int cols = 23;
@@ -315,36 +319,32 @@ void test_cuda_contractions() {
}
template<typename>
-void test_cuda_reductions(int size1, int size2, int redux) {
+void test_gpu_reductions(int size1, int size2, int redux) {
std::cout << "Reducing " << size1 << " by " << size2
- << " tensor along dim " << redux << std::endl;
+ << " tensor along dim " << redux << std::endl;
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = size1*size2;
int result_size = (redux == 1 ? size1 : size2);
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size1, size2);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size1, size2);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+ d_float, size1, size2);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, result_size);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, result_size);
- gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
- gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+ gpu_float.device(gpu_device) = gpu_float.random() * 2.0f;
- Eigen::array<int, 1> redux_dim = {{redux}};
- gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+ Eigen::array<int, 1> redux_dim = {redux};
+ gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
Tensor<Eigen::half, 1> half_prec(result_size);
Tensor<Eigen::half, 1> full_prec(result_size);
@@ -357,50 +357,45 @@ void test_cuda_reductions(int size1, int size2, int redux) {
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
}
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
-void test_cuda_reductions() {
- test_cuda_reductions<void>(13, 13, 0);
- test_cuda_reductions<void>(13, 13, 1);
+void test_gpu_reductions() {
+ test_gpu_reductions<void>(13, 13, 0);
+ test_gpu_reductions<void>(13, 13, 1);
- test_cuda_reductions<void>(35, 36, 0);
- test_cuda_reductions<void>(35, 36, 1);
+ test_gpu_reductions<void>(35, 36, 0);
+ test_gpu_reductions<void>(35, 36, 1);
- test_cuda_reductions<void>(36, 35, 0);
- test_cuda_reductions<void>(36, 35, 1);
+ test_gpu_reductions<void>(36, 35, 0);
+ test_gpu_reductions<void>(36, 35, 1);
}
template<typename>
-void test_cuda_full_reductions() {
- Eigen::CudaStreamDevice stream;
+void test_gpu_full_reductions() {
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int size = 13;
int num_elem = size*size;
- float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
- float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+ float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
- d_float1, size, size);
- Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
- d_float2, size, size);
+ Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+ d_float, size, size);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
d_res_half);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
d_res_float);
- gpu_float1.device(gpu_device) = gpu_float1.random();
- gpu_float2.device(gpu_device) = gpu_float2.random();
+ gpu_float.device(gpu_device) = gpu_float.random();
- gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+ gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum();
Tensor<Eigen::half, 0> half_prec;
Tensor<Eigen::half, 0> full_prec;
@@ -410,24 +405,23 @@ void test_cuda_full_reductions() {
VERIFY_IS_APPROX(full_prec(), half_prec());
- gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
- gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+ gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>();
+ gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum();
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
gpu_device.synchronize();
VERIFY_IS_APPROX(full_prec(), half_prec());
- gpu_device.deallocate(d_float1);
- gpu_device.deallocate(d_float2);
+ gpu_device.deallocate(d_float);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
-void test_cuda_forced_evals() {
+void test_gpu_forced_evals() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
@@ -440,7 +434,7 @@ void test_cuda_forced_evals() {
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
d_res_half1, num_elem);
- Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+ Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
d_res_half2, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
@@ -457,7 +451,7 @@ void test_cuda_forced_evals() {
Tensor<float, 1> half_prec2(num_elem);
Tensor<float, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
- gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
+ gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
gpu_device.synchronize();
@@ -475,20 +469,20 @@ void test_cuda_forced_evals() {
#endif
-void test_cxx11_tensor_of_float16_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu)
{
- CALL_SUBTEST_1(test_cuda_numext<void>());
-
-#ifdef EIGEN_HAS_CUDA_FP16
- CALL_SUBTEST_1(test_cuda_conversion<void>());
- CALL_SUBTEST_1(test_cuda_unary<void>());
- CALL_SUBTEST_1(test_cuda_elementwise<void>());
- CALL_SUBTEST_1(test_cuda_trancendental<void>());
- CALL_SUBTEST_2(test_cuda_contractions<void>());
- CALL_SUBTEST_3(test_cuda_reductions<void>());
- CALL_SUBTEST_4(test_cuda_full_reductions<void>());
- CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+ CALL_SUBTEST_1(test_gpu_numext<void>());
+
+#ifdef EIGEN_HAS_GPU_FP16
+ CALL_SUBTEST_1(test_gpu_conversion<void>());
+ CALL_SUBTEST_1(test_gpu_unary<void>());
+ CALL_SUBTEST_1(test_gpu_elementwise<void>());
+ CALL_SUBTEST_1(test_gpu_trancendental<void>());
+ CALL_SUBTEST_2(test_gpu_contractions<void>());
+ CALL_SUBTEST_3(test_gpu_reductions<void>());
+ CALL_SUBTEST_4(test_gpu_full_reductions<void>());
+ CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else
- std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+ std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
#endif
}
diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp
index 4ef9aed91..159656276 100644
--- a/unsupported/test/cxx11_tensor_of_strings.cpp
+++ b/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -141,7 +141,7 @@ static void test_initialization()
}
-void test_cxx11_tensor_of_strings()
+EIGEN_DECLARE_TEST(cxx11_tensor_of_strings)
{
// Beware: none of this is likely to ever work on a GPU.
CALL_SUBTEST(test_assign());
diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp
index ffa19896e..b8a329deb 100644
--- a/unsupported/test/cxx11_tensor_padding.cpp
+++ b/unsupported/test/cxx11_tensor_padding.cpp
@@ -84,7 +84,7 @@ static void test_padded_expr()
}
}
-void test_cxx11_tensor_padding()
+EIGEN_DECLARE_TEST(cxx11_tensor_padding)
{
CALL_SUBTEST(test_simple_padding<ColMajor>());
CALL_SUBTEST(test_simple_padding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
new file mode 100644
index 000000000..727a9ffd7
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -0,0 +1,157 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
+{
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ tensor.setRandom();
+
+ array<std::pair<IndexType, IndexType>, 4> paddings;
+ paddings[0] = std::make_pair(0, 0);
+ paddings[1] = std::make_pair(2, 1);
+ paddings[2] = std::make_pair(3, 4);
+ paddings[3] = std::make_pair(0, 0);
+
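+ // pad() prepends/appends zeros per dimension, so with the pairs above the
+ // output dims are 2, 2+3+1 = 6, 3+5+4 = 12 and 7: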
+ IndexType padedSizeDim1 = 2;
+ IndexType padedSizeDim2 = 6;
+ IndexType padedSizeDim3 = 12;
+ IndexType padedSizeDim4 = 7;
+ array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
+
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange);
+
+ VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+ VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+ VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+ VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+ gpu2.device(sycl_device)=gpu1.pad(paddings);
+ sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
+ for (IndexType i = 0; i < padedSizeDim1; ++i) {
+ for (IndexType j = 0; j < padedSizeDim2; ++j) {
+ for (IndexType k = 0; k < padedSizeDim3; ++k) {
+ for (IndexType l = 0; l < padedSizeDim4; ++l) {
+ if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+ VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+ } else {
+ VERIFY_IS_EQUAL(padded(i,j,k,l), DataType(0));
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
+{
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ tensor.setRandom();
+
+ array<std::pair<IndexType, IndexType>, 4> paddings;
+ paddings[0] = std::make_pair(0, 0);
+ paddings[1] = std::make_pair(2, 1);
+ paddings[2] = std::make_pair(3, 4);
+ paddings[3] = std::make_pair(0, 0);
+
+ Eigen::DSizes<IndexType, 2> reshape_dims;
+ reshape_dims[0] = 12;
+ reshape_dims[1] = 84;
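+ // The padded tensor is (2, 6, 12, 7); the reshape collapses it pairwise
+ // into (2*6, 12*7) = (12, 84).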
+
+
+ Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims);
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims);
+
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+ gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
+ sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 6; ++j) {
+ for (IndexType k = 0; k < 12; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ const DataType result_value = DataLayout == ColMajor ?
+ result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+ if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+ VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+ } else {
+ VERIFY_IS_EQUAL(result_value, DataType(0));
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
+ test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
+ test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp
index 434359730..498ab8ca7 100644
--- a/unsupported/test/cxx11_tensor_patch.cpp
+++ b/unsupported/test/cxx11_tensor_patch.cpp
@@ -164,7 +164,7 @@ static void test_simple_patch()
}
}
-void test_cxx11_tensor_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_patch)
{
CALL_SUBTEST(test_simple_patch<ColMajor>());
CALL_SUBTEST(test_simple_patch<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
new file mode 100644
index 000000000..7f92bec78
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){
+
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ array<IndexType, 5> patchTensorRange;
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}};
+ } else {
+ patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4, 1, 1, 1, 1}};
+ }
+
+ Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
+ Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange);
+
+ tensor.setRandom();
+
+ array<ptrdiff_t, 4> patch_dims;
+ patch_dims[0] = 1;
+ patch_dims[1] = 1;
+ patch_dims[2] = 1;
+ patch_dims[3] = 1;
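+ // 1x1x1x1 patches: one patch per element, so extract_patches() returns
+ // tensor.size() patches in the original element order.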
+
+ const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
+ size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType);
+ DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+ DataType* gpu_data_no_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+ gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+ } else {
+ VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+ VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+ }
+
+ for (int i = 0; i < tensor.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+ }
+
+ patch_dims[0] = 2;
+ patch_dims[1] = 3;
+ patch_dims[2] = 5;
+ patch_dims[3] = 7;
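+ // Patch extents equal to the full tensor dimensions, so there is exactly
+ // one patch and it is a copy of the whole tensor.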
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, 1}};
+ } else {
+ patchTensorRange = {{1, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange);
+ patchTensorBuffSize =single_patch.size()*sizeof(DataType);
+ DataType* gpu_data_single_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange);
+
+ gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+ VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+ VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+ VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+ VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+ } else {
+ VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+ VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+ VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+ }
+
+ for (int i = 0; i < tensor.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+ }
+ patch_dims[0] = 1;
+ patch_dims[1] = 2;
+ patch_dims[2] = 2;
+ patch_dims[3] = 1;
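+ // Valid patch offsets per dimension: 2, 3-2+1 = 2, 5-2+1 = 4 and 7,
+ // so there are 2*2*4*7 patches in total.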
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1, 2, 2, 1, 2*2*4*7}};
+ } else {
+ patchTensorRange = {{2*2*4*7, 1, 2, 2, 1}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange);
+ patchTensorBuffSize =twod_patch.size()*sizeof(DataType);
+ DataType* gpu_data_twod_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange);
+
+ gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+ } else {
+ VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+ VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+ VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 4; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ int patch_loc;
+ if (DataLayout == ColMajor) {
+ patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+ } else {
+ patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+ }
+ for (int x = 0; x < 2; ++x) {
+ for (int y = 0; y < 2; ++y) {
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ patch_dims[0] = 1;
+ patch_dims[1] = 2;
+ patch_dims[2] = 3;
+ patch_dims[3] = 5;
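+ // Valid patch offsets per dimension: 2, 3-2+1 = 2, 5-3+1 = 3 and 7-5+1 = 3,
+ // so there are 2*2*3*3 patches in total.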
+
+ if (DataLayout == ColMajor) {
+ patchTensorRange = {{1, 2, 3, 5, 2*2*3*3}};
+ } else {
+ patchTensorRange = {{2*2*3*3, 1, 2, 3, 5}};
+ }
+ Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange);
+ patchTensorBuffSize =threed_patch.size()*sizeof(DataType);
+ DataType* gpu_data_threed_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange);
+
+ gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims);
+ sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize);
+
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+ VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+ VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+ VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+ } else {
+ VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+ VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+ VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+ VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 3; ++k) {
+ for (int l = 0; l < 3; ++l) {
+ int patch_loc;
+ if (DataLayout == ColMajor) {
+ patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+ } else {
+ patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+ }
+ for (int x = 0; x < 2; ++x) {
+ for (int y = 0; y < 3; ++y) {
+ for (int z = 0; z < 5; ++z) {
+ if (DataLayout == ColMajor) {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+ } else {
+ VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_tensor);
+ sycl_device.deallocate(gpu_data_no_patch);
+ sycl_device.deallocate(gpu_data_single_patch);
+ sycl_device.deallocate(gpu_data_twod_patch);
+ sycl_device.deallocate(gpu_data_threed_patch);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp
index 0f3dc5787..b9d4c5584 100644
--- a/unsupported/test/cxx11_tensor_random.cpp
+++ b/unsupported/test/cxx11_tensor_random.cpp
@@ -11,9 +11,10 @@
#include <Eigen/CXX11/Tensor>
+template<typename Scalar>
static void test_default()
{
- Tensor<float, 1> vec(6);
+ Tensor<Scalar, 1> vec(6);
vec.setRandom();
// Fixme: we should check that the generated numbers follow a uniform
@@ -23,10 +24,11 @@ static void test_default()
}
}
+template<typename Scalar>
static void test_normal()
{
- Tensor<float, 1> vec(6);
- vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+ Tensor<Scalar, 1> vec(6);
+ vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
// Fixme: we should check that the generated numbers follow a gaussian
// distribution instead.
@@ -70,9 +72,15 @@ static void test_custom()
}
}
-void test_cxx11_tensor_random()
+EIGEN_DECLARE_TEST(cxx11_tensor_random)
{
- CALL_SUBTEST(test_default());
- CALL_SUBTEST(test_normal());
+ CALL_SUBTEST((test_default<float>()));
+ CALL_SUBTEST((test_normal<float>()));
+ CALL_SUBTEST((test_default<double>()));
+ CALL_SUBTEST((test_normal<double>()));
+ CALL_SUBTEST((test_default<Eigen::half>()));
+ CALL_SUBTEST((test_normal<Eigen::half>()));
+ CALL_SUBTEST((test_default<Eigen::bfloat16>()));
+ CALL_SUBTEST((test_normal<Eigen::bfloat16>()));
CALL_SUBTEST(test_custom());
}
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_gpu.cu
index b3be199e1..090986ebc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_gpu.cu
@@ -9,18 +9,16 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
-void test_cuda_random_uniform()
+void test_gpu_random_uniform()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -28,24 +26,24 @@ void test_cuda_random_uniform()
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
gpu_out.device(gpu_device) = gpu_out.random();
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
- // For now we just check thes code doesn't crash.
+ // For now we just check this code doesn't crash.
// TODO: come up with a valid test of randomness
}
-void test_cuda_random_normal()
+void test_gpu_random_normal()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -53,9 +51,9 @@ void test_cuda_random_normal()
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -63,8 +61,8 @@ void test_cuda_random_normal()
Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
}
static void test_complex()
@@ -80,9 +78,9 @@ static void test_complex()
}
-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
{
- CALL_SUBTEST(test_cuda_random_uniform());
- CALL_SUBTEST(test_cuda_random_normal());
+ CALL_SUBTEST(test_gpu_random_uniform());
+ CALL_SUBTEST(test_gpu_random_normal());
CALL_SUBTEST(test_complex());
}
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 000000000..6c83894a3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+ gpu_out.device(sycl_device)=gpu_out.random();
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+ out.setZero();
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+ Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+ gpu_out.device(sycl_device)=gpu_out.random(gen);
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+ CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..c46c4c91d 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,22 @@ static void test_trivial_reductions() {
}
}
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
static void test_simple_reductions() {
- Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+ Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
tensor.setRandom();
+ // Add a little offset so that the product reductions won't be close to zero.
+ tensor += tensor.constant(Scalar(0.5f));
array<ptrdiff_t, 2> reduction_axis2;
reduction_axis2[0] = 1;
reduction_axis2[1] = 3;
- Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+ Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_EQUAL(result.dimension(1), 5);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 5; ++j) {
- float sum = 0.0f;
+ Scalar sum = Scalar(0.0f);
for (int k = 0; k < 3; ++k) {
for (int l = 0; l < 7; ++l) {
sum += tensor(i, k, j, l);
@@ -77,7 +79,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+ Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum();
VERIFY_IS_EQUAL(sum1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -85,7 +87,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
VERIFY_IS_EQUAL(sum2.rank(), 0);
VERIFY_IS_APPROX(sum1(), sum2());
@@ -98,7 +100,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 7; ++j) {
- float prod = 1.0f;
+ Scalar prod = Scalar(1.0f);
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 5; ++l) {
prod *= tensor(k, i, l, j);
@@ -109,7 +111,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+ Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod();
VERIFY_IS_EQUAL(prod1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -117,7 +119,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
VERIFY_IS_EQUAL(prod2.rank(), 0);
VERIFY_IS_APPROX(prod1(), prod2());
@@ -130,7 +132,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 7; ++j) {
- float max_val = std::numeric_limits<float>::lowest();
+ Scalar max_val = std::numeric_limits<Scalar>::lowest();
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 5; ++l) {
max_val = (std::max)(max_val, tensor(k, i, l, j));
@@ -141,7 +143,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+ Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum();
VERIFY_IS_EQUAL(max1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -149,7 +151,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
VERIFY_IS_EQUAL(max2.rank(), 0);
VERIFY_IS_APPROX(max1(), max2());
@@ -162,7 +164,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
- float min_val = (std::numeric_limits<float>::max)();
+ Scalar min_val = (std::numeric_limits<Scalar>::max)();
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 3; ++l) {
min_val = (std::min)(min_val, tensor(k, l, i, j));
@@ -173,7 +175,7 @@ static void test_simple_reductions() {
}
{
- Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+ Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum();
VERIFY_IS_EQUAL(min1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -181,7 +183,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
VERIFY_IS_EQUAL(min2.rank(), 0);
VERIFY_IS_APPROX(min1(), min2());
@@ -194,7 +196,7 @@ static void test_simple_reductions() {
VERIFY_IS_EQUAL(result.dimension(1), 7);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 7; ++j) {
- float sum = 0.0f;
+ Scalar sum = Scalar(0.0f);
int count = 0;
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 3; ++l) {
@@ -202,12 +204,12 @@ static void test_simple_reductions() {
++count;
}
}
- VERIFY_IS_APPROX(result(i, j), sum / count);
+ VERIFY_IS_APPROX(result(i, j), sum / Scalar(count));
}
}
{
- Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+ Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean();
VERIFY_IS_EQUAL(mean1.rank(), 0);
array<ptrdiff_t, 4> reduction_axis4;
@@ -215,7 +217,7 @@ static void test_simple_reductions() {
reduction_axis4[1] = 1;
reduction_axis4[2] = 2;
reduction_axis4[3] = 3;
- Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+ Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
VERIFY_IS_EQUAL(mean2.rank(), 0);
VERIFY_IS_APPROX(mean1(), mean2());
@@ -225,11 +227,11 @@ static void test_simple_reductions() {
Tensor<int, 1> ints(10);
std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
- TensorFixedSize<bool, Sizes<> > all;
- all = ints.all();
- VERIFY(!all());
- all = (ints >= ints.constant(0)).all();
- VERIFY(all());
+ TensorFixedSize<bool, Sizes<> > all_;
+ all_ = ints.all();
+ VERIFY(!all_());
+ all_ = (ints >= ints.constant(0)).all();
+ VERIFY(all_());
TensorFixedSize<bool, Sizes<> > any;
any = (ints > ints.constant(10)).any();
@@ -368,7 +370,7 @@ static void test_static_dims() {
Tensor<float, 2, DataLayout> out(72, 97);
in.setRandom();
-#if !EIGEN_HAS_CONSTEXPR
+#if !EIGEN_HAS_CONSTEXPR
array<int, 2> reduction_axis;
reduction_axis[0] = 1;
reduction_axis[1] = 3;
@@ -386,7 +388,7 @@ static void test_static_dims() {
expected = (std::max)(expected, in(i, k, j, l));
}
}
- VERIFY_IS_APPROX(out(i, j), expected);
+ VERIFY_IS_EQUAL(out(i, j), expected);
}
}
}
@@ -417,7 +419,7 @@ static void test_innermost_last_dims() {
expected = (std::max)(expected, in(l, k, i, j));
}
}
- VERIFY_IS_APPROX(out(i, j), expected);
+ VERIFY_IS_EQUAL(out(i, j), expected);
}
}
}
@@ -448,7 +450,7 @@ static void test_innermost_first_dims() {
expected = (std::max)(expected, in(i, j, k, l));
}
}
- VERIFY_IS_APPROX(out(i, j), expected);
+ VERIFY_IS_EQUAL(out(i, j), expected);
}
}
}
@@ -479,16 +481,37 @@ static void test_reduce_middle_dims() {
expected = (std::max)(expected, in(i, k, l, j));
}
}
- VERIFY_IS_APPROX(out(i, j), expected);
+ VERIFY_IS_EQUAL(out(i, j), expected);
+ }
+ }
+}
+
+static void test_sum_accuracy() {
+ Tensor<float, 3> tensor(101, 101, 101);
+ for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) {
+ tensor.setRandom();
+ tensor += tensor.constant(prescribed_mean);
+
+ Tensor<float, 0> sum = tensor.sum();
+ double expected_sum = 0.0;
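+ // Accumulate the reference in double so that round-off in the
+ // single-precision tensor sum is visible against it.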
+ for (int i = 0; i < 101; ++i) {
+ for (int j = 0; j < 101; ++j) {
+ for (int k = 0; k < 101; ++k) {
+ expected_sum += static_cast<double>(tensor(i, j, k));
+ }
+ }
}
+ VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum));
}
}
-void test_cxx11_tensor_reduction() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction) {
CALL_SUBTEST(test_trivial_reductions<ColMajor>());
CALL_SUBTEST(test_trivial_reductions<RowMajor>());
- CALL_SUBTEST(test_simple_reductions<ColMajor>());
- CALL_SUBTEST(test_simple_reductions<RowMajor>());
+ CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() ));
+ CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() ));
+ CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() ));
+ CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() ));
CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
CALL_SUBTEST(test_full_reductions<ColMajor>());
@@ -505,4 +528,5 @@ void test_cxx11_tensor_reduction() {
CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+ CALL_SUBTEST(test_sum_accuracy());
}
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu
index 6858b43a7..122ac946b 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu
@@ -9,12 +9,9 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
@@ -22,7 +19,7 @@
template<typename Type, int DataLayout>
static void test_full_reductions() {
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
@@ -70,7 +67,7 @@ static void test_first_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data(T)
@@ -110,7 +107,7 @@ static void test_last_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data
@@ -137,7 +134,7 @@ static void test_last_dim_reductions() {
}
-void test_cxx11_tensor_reduction_cuda() {
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) {
CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index a9ef82907..a297716e4 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -13,38 +13,168 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
#include "main.h"
+
#include <unsupported/Eigen/CXX11/Tensor>
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ const IndexType num_rows = 753;
+ const IndexType num_cols = 537;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+ array<IndexType, 2> outRange = {{1, 1}};
-static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
+ Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
- const int num_rows = 452;
- const int num_cols = 765;
- array<int, 2> tensorRange = {{num_rows, num_cols}};
+ in.setRandom();
+ auto dim = DSizes<IndexType, 2>(1, 1);
+ full_redux = in.sum().reshape(dim);
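+ // The full reduction is rank 0; reshaping it to (1, 1) lets the host and
+ // device results share the same 2-D tensor map.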
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+ sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data,
+ outRange);
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+ sycl_device.memcpyDeviceToHost(
+ full_redux_gpu.data(), gpu_out_data,
+ (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0)
+ << ", CPU FULL: " << full_redux(0, 0) << "\n";
+ VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_sum_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
- Tensor<float, 2> in(tensorRange);
- Tensor<float, 0> full_redux;
- Tensor<float, 0> full_redux_gpu;
+ data_tensor in(tensor_range);
+ scalar_tensor full_redux;
+ scalar_tensor full_redux_gpu;
in.setRandom();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+
+ const IndexType offset = 64;
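+ // Mapping the buffer 64 elements in, with dim 0 shrunk by one, means the
+ // reduction must skip the first 64 entries entirely.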
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ full_redux = in_offset.sum();
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data =
+ static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.sum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
- full_redux = in.sum();
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
- float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
- float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float));
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
- TensorMap<Tensor<float, 2> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<float, 0> > out_gpu(gpu_out_data);
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ const IndexType num_rows = 4096;
+ const IndexType num_cols = 4096;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+ in.setRandom();
+
+ full_redux = in.maximum();
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.maximum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_max_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
+
+ data_tensor in(tensor_range);
+ scalar_tensor full_redux;
+ scalar_tensor full_redux_gpu;
+
+ in.setRandom();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+ // Set the initial value to be the max.
+ // As we don't include this in the reduction, the result should not be 2.
+ in(0) = static_cast<DataType>(2);
+
+ const IndexType offset = 64;
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ full_redux = in_offset.maximum();
+ VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data =
+ static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.maximum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
- out_gpu.device(sycl_device) = in_gpu.sum();
- sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
@@ -52,87 +182,833 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
sycl_device.deallocate(gpu_out_data);
}
-static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ const IndexType num_rows = 4096;
+ const IndexType num_cols = 4096;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+ array<IndexType, 1> argRange = {{num_cols}};
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 0;
+ // red_axis[1]=1;
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
+ Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
+ Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
+ Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+ in.setRandom();
+ in_arg1.setRandom();
+ in_arg2.setRandom();
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate(
+ in_arg1.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate(
+ in_arg2.dimensions().TotalSize() * sizeof(DataType)));
+ bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate(
+ out_arg_gpu.dimensions().TotalSize() * sizeof(bool)));
+ bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate(
+ out_arg_gpu.dimensions().TotalSize() * sizeof(bool)));
+
+ DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(
+ gpu_in_arg1_data, tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(
+ gpu_in_arg2_data, tensorRange);
+ TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(
+ gpu_out_arg_data, argRange);
+ TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(
+ gpu_out_arg__gpu_helper_data, argRange);
+ TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+ // CPU VERSION
+ out_arg_cpu =
+ (in_arg1.argmax(1) == in_arg2.argmax(1))
+ .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
+ full_redux = (out_arg_cpu.template cast<DataType>())
+ .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
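+ // i.e. the fraction of rows whose argmax along dim 1 agrees between the
+ // two inputs.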
+
+ // GPU VERSION
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(
+ gpu_in_arg1_data, in_arg1.data(),
+ (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(
+ gpu_in_arg2_data, in_arg2.data(),
+ (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
+ out_Argout_gpu_helper.device(sycl_device) =
+ (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
+ out_Argout_gpu.device(sycl_device) =
+ (out_Argout_gpu_helper)
+ .select(out_Argout_gpu.constant(true),
+ out_Argout_gpu.constant(false));
+ out_gpu.device(sycl_device) =
+ (out_Argout_gpu.template cast<DataType>())
+ .reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux()
+ << '\n';
+ VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_in_arg1_data);
+ sycl_device.deallocate(gpu_in_arg2_data);
+ sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
+ sycl_device.deallocate(gpu_out_arg_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
+
+ data_tensor in(tensor_range);
+ scalar_tensor full_redux;
+ scalar_tensor full_redux_gpu;
+
+ in.setRandom();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+
+ const IndexType offset = 64;
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ full_redux = in_offset.mean();
+ VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data =
+ static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.mean();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_mean_with_odd_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ // This is a particular case which illustrates a possible problem when the
+ // number of local threads in a workgroup is even but not a power of two.
+ using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+ using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+ // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
+ // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
+ const IndexType n_elems = 8707;
+ array<IndexType, 1> tensor_range = {{n_elems}};
+
+ data_tensor in(tensor_range);
+ DataType full_redux;
+ DataType full_redux_gpu;
+ TensorMap<scalar_tensor> red_cpu(&full_redux);
+ TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
+
+ const DataType const_val = static_cast<DataType>(0.6391);
+ in = in.constant(const_val);
+
+ Eigen::IndexList<Eigen::type2index<0>> red_axis;
+ red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+ VERIFY_IS_APPROX(const_val, red_cpu());
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data =
+ static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
+ TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) =
+ in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
+ sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu, full_redux);
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ const IndexType num_rows = 876;
+ const IndexType num_cols = 953;
+ array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux;
+ Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+
+ in.setRandom();
+
+ full_redux = in.minimum();
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.minimum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_full_reductions_min_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
+
+ data_tensor in(tensor_range);
+ scalar_tensor full_redux;
+ scalar_tensor full_redux_gpu;
+
+ in.setRandom();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+ // Set the initial value to be the min.
+ // As we don't include this in the reduction, the result should not be -2.
+ in(0) = static_cast<DataType>(-2);
+
+ const IndexType offset = 64;
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ full_redux = in_offset.minimum();
+ VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data =
+ static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<scalar_tensor> out_gpu(gpu_out_data);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.minimum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
+ sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ IndexType dim_x = 145;
+ IndexType dim_y = 1;
+ IndexType dim_z = 67;
+
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 0;
+ array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+ in.setRandom();
+
+ redux = in.maximum(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu.data(), gpu_out_data,
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+ for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+ VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
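+// Offset variant of the first-dim max reduction: the first 64 elements are
+// excluded from the view, so the maxima planted there must not leak into the
+// result.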
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_max_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ array<IndexType, 1> reduced_range = {{num_cols}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
+ const IndexType n_reduced = num_cols;
- int dim_x = 145;
- int dim_y = 1;
- int dim_z = 67;
+ data_tensor in(tensor_range);
+ reduced_tensor redux;
+ reduced_tensor redux_gpu(reduced_range);
- array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<int, 1> red_axis;
+ in.setRandom();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+ // Plant maximum values in the region that the offset view excludes.
+ for (IndexType i = 0; i < n_reduced; i++) {
+ in(i) = static_cast<DataType>(2);
+ }
+
+ Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 0;
- array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
- Tensor<float, 3> in(tensorRange);
- Tensor<float, 2> redux(reduced_tensorRange);
- Tensor<float, 2> redux_gpu(reduced_tensorRange);
+ const IndexType offset = 64;
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ redux = in_offset.maximum(red_axis);
+ for (IndexType i = 0; i < n_reduced; i++) {
+ VERIFY_IS_NOT_EQUAL(redux(i), in(i));
+ }
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(
+ sycl_device.allocate(n_reduced * sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+ sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data,
+ n_reduced * sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType i = 0; i < n_reduced; i++) {
+ VERIFY_IS_APPROX(redux_gpu(i), redux(i));
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
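+// Last-dim variant that offsets both the input view and the output view, so
+// device reads and writes both start at shifted addresses.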
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_max_with_offset_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
+ using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
+
+ const IndexType num_rows = 64;
+ const IndexType num_cols = 64;
+ array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
+ array<IndexType, 1> full_reduced_range = {{num_rows}};
+ array<IndexType, 1> reduced_range = {{num_rows - 1}};
+ const IndexType n_elems = internal::array_prod(tensor_range);
+ const IndexType n_reduced = reduced_range[0];
+
+ data_tensor in(tensor_range);
+ reduced_tensor redux(full_reduced_range);
+ reduced_tensor redux_gpu(reduced_range);
in.setRandom();
+ redux.setZero();
+ array<IndexType, 2> tensor_offset_range(tensor_range);
+ tensor_offset_range[0] -= 1;
+ // Plant maximum values in the region that the offset view excludes.
+ for (IndexType i = 0; i < n_reduced; i++) {
+ in(i) = static_cast<DataType>(2);
+ }
+
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 1;
+
+ const IndexType offset = 64;
+ // Introduce an offset in both the input and the output.
+ TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
+ TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
+ red_offset = in_offset.maximum(red_axis);
+
+ // Check that the first output value is untouched and that none of the
+ // reduced values equal the maxima planted outside the reduced range.
+ VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
+ for (IndexType i = 0; i < n_reduced; i++) {
+ VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
+ }
+
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(
+ sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
+
+ TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
+ TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),
+ n_elems * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
+ sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(),
+ n_reduced * sizeof(DataType));
- redux= in.sum(red_axis);
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType i = 0; i < n_reduced; i++) {
+ VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
- float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
- float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
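+// Sum along the first dimension for a caller-supplied (dim_x, dim_y) shape;
+// the harness below exercises both large and non-power-of-two sizes.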
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_sum_sycl(
+ const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) {
+ array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 0;
+ array<IndexType, 1> reduced_tensorRange = {{dim_y}};
- TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+ in.setRandom();
+ redux = in.sum(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
- sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu.data(), gpu_out_data,
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
- for(int j=0; j<reduced_tensorRange[0]; j++ )
- for(int k=0; k<reduced_tensorRange[1]; k++ )
- VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+ for (IndexType i = 0; i < redux.size(); i++) {
+ VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
+ }
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
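+// Mean reduction along the first dimension; same pattern as the sum test but
+// also exercises the final division by the reduced extent on the device.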
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_first_dim_reductions_mean_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ IndexType dim_x = 145;
+ IndexType dim_y = 1;
+ IndexType dim_z = 67;
+
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 0;
+ array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+ in.setRandom();
+
+ redux = in.mean(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu.data(), gpu_out_data,
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+ for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+ VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_mean_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ IndexType dim_x = 64;
+ IndexType dim_y = 1;
+ IndexType dim_z = 32;
+
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
+ red_axis[0] = 2;
+ array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+
+ in.setRandom();
+
+ redux = in.mean(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
- int dim_x = 567;
- int dim_y = 1;
- int dim_z = 47;
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+ gpu_out_data, reduced_tensorRange);
- array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<int, 1> red_axis;
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu.data(), gpu_out_data,
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+ for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+ VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_dim_reductions_sum_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ IndexType dim_x = 64;
+ IndexType dim_y = 1;
+ IndexType dim_z = 32;
+
+ array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 2;
- array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
+ array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
- Tensor<float, 3> in(tensorRange);
- Tensor<float, 2> redux(reduced_tensorRange);
- Tensor<float, 2> redux_gpu(reduced_tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
+ Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
in.setRandom();
- redux= in.sum(red_axis);
+ redux = in.sum(red_axis);
- float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
- float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
- TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(
+ gpu_out_data, reduced_tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
- sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu.data(), gpu_out_data,
+ redux_gpu.dimensions().TotalSize() * sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
- for(int j=0; j<reduced_tensorRange[0]; j++ )
- for(int k=0; k<reduced_tensorRange[1]; k++ )
- VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
+ for (IndexType k = 0; k < reduced_tensorRange[1]; k++)
+ VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
+}
+
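+// Fixed-size variant: TensorFixedSize with a compile-time IndexList reduction
+// axis, so all dimensions are known statically when the kernel is built.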
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_sum_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ auto tensorRange = Sizes<64, 32>(64, 32);
+ Eigen::IndexList<Eigen::type2index<1>> red_axis;
+ auto reduced_tensorRange = Sizes<64>(64);
+ TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+ TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+ TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+ in_fix.setRandom();
+
+ redux_fix = in_fix.sum(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+ gpu_in_data, tensorRange);
+ TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in_fix.data(),
+ (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu_fix.data(), gpu_out_data,
+ redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+ VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+ }
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_last_reductions_mean_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ auto tensorRange = Sizes<64, 32>(64, 32);
+ Eigen::IndexList<Eigen::type2index<1>> red_axis;
+ auto reduced_tensorRange = Sizes<64>(64);
+ TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix;
+ TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix;
+ TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix;
+
+ in_fix.setRandom();
+ redux_fix = in_fix.mean(red_axis);
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(
+ gpu_in_data, tensorRange);
+ TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, in_fix.data(),
+ (in_fix.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
+ sycl_device.memcpyDeviceToHost(
+ redux_gpu_fix.data(), gpu_out_data,
+ redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
+ sycl_device.synchronize();
+ // Check that the CPU and GPU reductions return the same result.
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+ VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports the generic reduction case where the accumulator has a
+// different type than the input data. This reducer shows how to detect, in a
+// single reduction pass, whether a Tensor contains NaN and/or Inf values.
+template <typename InT, typename OutT>
+struct CustomReducer {
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ static constexpr OutT InfBit = 1;
+ static constexpr OutT NanBit = 2;
+
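+ // Accumulate one input element: fold its Inf/NaN status into the bitmask.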
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+ OutT* accum) const {
+ if (Eigen::numext::isinf(x))
+ *accum |= InfBit;
+ else if (Eigen::numext::isnan(x))
+ *accum |= NanBit;
+ }
+
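+ // Merge two partial accumulators; needed when the reduction is evaluated
+ // in parallel on the device.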
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+ OutT* accum) const {
+ *accum |= x;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+ return OutT(0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+ return accum;
+ }
+};
+
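+// Drive the custom reducer through Tensor::reduce(dims, reducer) with a float
+// input and an integer bitmask accumulator; random data should contain
+// neither NaN nor Inf, so the expected result is 0.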
+template <typename DataType, typename AccumType, int DataLayout,
+ typename IndexType>
+static void test_full_reductions_custom_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ constexpr IndexType InSize = 64;
+ auto tensorRange = Sizes<InSize>(InSize);
+ Eigen::IndexList<Eigen::type2index<0>> dims;
+ auto reduced_tensorRange = Sizes<>();
+ TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+ TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+ CustomReducer<DataType, AccumType> reducer;
+
+ in_fix.setRandom();
+
+ size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+ AccumType* gpu_out_data =
+ static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+ TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+ gpu_in_data, tensorRange);
+ TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+ out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+ sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+ sizeof(AccumType));
+ VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+ test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+ test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+ sycl_device);
+ sycl_device.synchronize();
}
-void test_cxx11_tensor_reduction_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice sycl_device(s);
- CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
- CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
- CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+template <typename DataType, typename Dev>
+void sycl_reduction_full_offset_per_device(const Dev& sycl_device) {
+ test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) {
+ test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device,
+ 4197, 4097);
+ test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+ 4197, 4097);
+ test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device,
+ 129, 8);
+ test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ sycl_device.synchronize();
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) {
+ test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(
+ sycl_device);
+ test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ sycl_device.synchronize();
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ std::cout << "Running on "
+ << device.template get_info<cl::sycl::info::device::name>()
+ << std::endl;
+ QueueInterface queueInterface(device);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
+ CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
+ CALL_SUBTEST_3(
+ sycl_reduction_test_first_dim_per_device<float>(sycl_device));
+ CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
+ }
}
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp
index c8f105e3d..7dbd0478c 100644
--- a/unsupported/test/cxx11_tensor_ref.cpp
+++ b/unsupported/test/cxx11_tensor_ref.cpp
@@ -235,7 +235,7 @@ static void test_nested_ops_with_ref()
}
-void test_cxx11_tensor_ref()
+EIGEN_DECLARE_TEST(cxx11_tensor_ref)
{
CALL_SUBTEST(test_simple_lvalue_ref());
CALL_SUBTEST(test_simple_rvalue_ref());
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index b35b8d29e..5e44ec007 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue)
}
-void test_cxx11_tensor_reverse()
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse)
{
CALL_SUBTEST(test_simple_reverse<ColMajor>());
CALL_SUBTEST(test_simple_reverse<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
new file mode 100644
index 000000000..dd30c235d
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
+ IndexType dim1 = 2;
+ IndexType dim2 = 3;
+ IndexType dim3 = 5;
+ IndexType dim4 = 7;
+
+ array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
+ tensor.setRandom();
+
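+ // dim_rev selects which dimensions get reversed; the test repeats the check
+ // below for several combinations of reversed axes.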
+ array<bool, 4> dim_rev;
+ dim_rev[0] = false;
+ dim_rev[1] = true;
+ dim_rev[2] = true;
+ dim_rev[3] = false;
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+ reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+ tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, tensor.data(),
+ (tensor.dimensions().TotalSize()) * sizeof(DataType));
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(
+ reversed_tensor.data(), gpu_out_data,
+ reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+ // Check that dimensions 1 and 2 were reversed on the device.
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i, j, k, l),
+ reversed_tensor(i, 2 - j, 4 - k, l));
+ }
+ }
+ }
+ }
+ dim_rev[0] = true;
+ dim_rev[1] = false;
+ dim_rev[2] = false;
+ dim_rev[3] = false;
+
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(
+ reversed_tensor.data(), gpu_out_data,
+ reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
+ }
+ }
+ }
+ }
+
+ dim_rev[0] = true;
+ dim_rev[1] = false;
+ dim_rev[2] = false;
+ dim_rev[3] = true;
+ out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
+ sycl_device.memcpyDeviceToHost(
+ reversed_tensor.data(), gpu_out_data,
+ reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i, j, k, l),
+ reversed_tensor(1 - i, j, k, 6 - l));
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+ bool LValue) {
+ IndexType dim1 = 2;
+ IndexType dim2 = 3;
+ IndexType dim3 = 5;
+ IndexType dim4 = 7;
+
+ array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
+ tensor.setRandom();
+
+ array<bool, 4> dim_rev;
+ dim_rev[0] = false;
+ dim_rev[1] = true;
+ dim_rev[2] = false;
+ dim_rev[3] = true;
+
+ DataType* gpu_in_data = static_cast<DataType*>(
+ sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+ expected.dimensions().TotalSize() * sizeof(DataType)));
+ DataType* gpu_out_data_result = static_cast<DataType*>(
+ sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+ tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+ gpu_out_data_expected, tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+ gpu_out_data_result, tensorRange);
+
+ sycl_device.memcpyHostToDevice(
+ gpu_in_data, tensor.data(),
+ (tensor.dimensions().TotalSize()) * sizeof(DataType));
+
+ if (LValue) {
+ out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
+ } else {
+ out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
+ }
+ sycl_device.memcpyDeviceToHost(
+ expected.data(), gpu_out_data_expected,
+ expected.dimensions().TotalSize() * sizeof(DataType));
+
+ array<IndexType, 4> src_slice_dim;
+ src_slice_dim[0] = 2;
+ src_slice_dim[1] = 3;
+ src_slice_dim[2] = 1;
+ src_slice_dim[3] = 7;
+ array<IndexType, 4> src_slice_start;
+ src_slice_start[0] = 0;
+ src_slice_start[1] = 0;
+ src_slice_start[2] = 0;
+ src_slice_start[3] = 0;
+ array<IndexType, 4> dst_slice_dim = src_slice_dim;
+ array<IndexType, 4> dst_slice_start = src_slice_start;
+
+ for (IndexType i = 0; i < 5; ++i) {
+ if (LValue) {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+ .reverse(dim_rev)
+ .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
+ } else {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+ in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+ }
+ src_slice_start[2] += 1;
+ dst_slice_start[2] += 1;
+ }
+ sycl_device.memcpyDeviceToHost(
+ result.data(), gpu_out_data_result,
+ result.dimensions().TotalSize() * sizeof(DataType));
+
+ for (IndexType i = 0; i < expected.dimension(0); ++i) {
+ for (IndexType j = 0; j < expected.dimension(1); ++j) {
+ for (IndexType k = 0; k < expected.dimension(2); ++k) {
+ for (IndexType l = 0; l < expected.dimension(3); ++l) {
+ VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+ }
+ }
+ }
+ }
+
+ dst_slice_start[2] = 0;
+ result.setRandom();
+ sycl_device.memcpyHostToDevice(
+ gpu_out_data_result, result.data(),
+ (result.dimensions().TotalSize()) * sizeof(DataType));
+ for (IndexType i = 0; i < 5; ++i) {
+ if (LValue) {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+ .reverse(dim_rev)
+ .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+ } else {
+ out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+ in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+ }
+ dst_slice_start[2] += 1;
+ }
+ sycl_device.memcpyDeviceToHost(
+ result.data(), gpu_out_data_result,
+ result.dimensions().TotalSize() * sizeof(DataType));
+
+ for (IndexType i = 0; i < expected.dimension(0); ++i) {
+ for (IndexType j = 0; j < expected.dimension(1); ++j) {
+ for (IndexType k = 0; k < expected.dimension(2); ++k) {
+ for (IndexType l = 0; l < expected.dimension(3); ++l) {
+ VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
+ }
+ }
+ }
+ }
+}
+
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
+ QueueInterface queueInterface(d);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
+ test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
+ test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
+ test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
+ test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ std::cout << "Running on "
+ << device.get_info<cl::sycl::info::device::name>() << std::endl;
+ CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+ CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+ CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+ CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+ CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp
index 2c26151ab..83b592384 100644
--- a/unsupported/test/cxx11_tensor_roundings.cpp
+++ b/unsupported/test/cxx11_tensor_roundings.cpp
@@ -54,7 +54,7 @@ static void test_float_ceiling()
}
}
-void test_cxx11_tensor_roundings()
+EIGEN_DECLARE_TEST(cxx11_tensor_roundings)
{
CALL_SUBTEST(test_float_rounding());
CALL_SUBTEST(test_float_ceiling());
diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp
index af59aa3ef..dccee9e84 100644
--- a/unsupported/test/cxx11_tensor_scan.cpp
+++ b/unsupported/test/cxx11_tensor_scan.cpp
@@ -98,7 +98,7 @@ static void test_tensor_maps() {
}
}
-void test_cxx11_tensor_scan() {
+EIGEN_DECLARE_TEST(cxx11_tensor_scan) {
CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu
index 5f146f3c9..770a144f1 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_gpu.cu
@@ -9,21 +9,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-void test_cuda_cumsum(int m_size, int k_size, int n_size)
+void test_gpu_cumsum(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@@ -38,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
float* d_t_input;
float* d_t_result;
- cudaMalloc((void**)(&d_t_input), t_input_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_input), t_input_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@@ -54,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
t_result = t_input.cumsum(1);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
@@ -67,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_input);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_input);
+ gpuFree((void*)d_t_result);
}
-void test_cxx11_tensor_scan_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu)
{
- CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
- CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
}
diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp
new file mode 100644
index 000000000..09c45fce5
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size,
+ IndexType k_size, IndexType n_size, int consume_dim,
+ bool exclusive) {
+ static const DataType error_threshold = 1e-4f;
+ std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size
+ << " consume_dim : " << consume_dim << ")" << std::endl;
+ Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size);
+ Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size);
+ Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size,
+ n_size);
+
+ t_input.setRandom();
+ std::size_t t_input_bytes = t_input.size() * sizeof(DataType);
+ std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
+
+ DataType* gpu_data_in =
+ static_cast<DataType*>(sycl_device.allocate(t_input_bytes));
+ DataType* gpu_data_out =
+ static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
+
+ array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}};
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input(
+ gpu_data_in, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result(
+ gpu_data_out, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes);
+ sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes);
+
+ gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive);
+
+ t_result = t_input.cumsum(consume_dim, exclusive);
+
+ sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out,
+ t_result_bytes);
+ sycl_device.synchronize();
+
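+ // Accept an element if either its absolute error is below the threshold or
+ // it passes Eigen's relative isApprox test.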
+ for (IndexType i = 0; i < t_result.size(); i++) {
+ if (static_cast<DataType>(std::fabs(static_cast<DataType>(
+ t_result(i) - t_result_gpu(i)))) < error_threshold) {
+ continue;
+ }
+ if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i),
+ error_threshold)) {
+ continue;
+ }
+ std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i)
+ << " vs SYCL : " << t_result_gpu(i) << std::endl;
+ assert(false);
+ }
+ sycl_device.deallocate(gpu_data_in);
+ sycl_device.deallocate(gpu_data_out);
+}
+
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+ true);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+ true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+ true);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+ true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+ true);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+ true);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+ false);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0,
+ false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+ false);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1,
+ false);
+}
+template <typename DataType, typename Dev>
+void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) {
+ test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+ false);
+ test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2,
+ false);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ std::cout << "Running on "
+ << device.template get_info<cl::sycl::info::device::name>()
+ << std::endl;
+ QueueInterface queueInterface(device);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ CALL_SUBTEST_1(
+ sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device));
+ CALL_SUBTEST_2(
+ sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device));
+ CALL_SUBTEST_3(
+ sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device));
+ CALL_SUBTEST_4(
+ sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device));
+ CALL_SUBTEST_5(
+ sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device));
+ CALL_SUBTEST_6(
+ sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index d11444a14..89a64c021 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -81,12 +81,12 @@ static void test_expr_shuffling()
Tensor<float, 4, DataLayout> expected;
expected = tensor.shuffle(shuffles);
- Tensor<float, 4, DataLayout> result(5,7,3,2);
+ Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
- array<int, 4> src_slice_dim{{2,3,1,7}};
- array<int, 4> src_slice_start{{0,0,0,0}};
- array<int, 4> dst_slice_dim{{1,7,3,2}};
- array<int, 4> dst_slice_start{{0,0,0,0}};
+ array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
+ array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
+ array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
+ array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
for (int i = 0; i < 5; ++i) {
result.slice(dst_slice_start, dst_slice_dim) =
@@ -215,7 +215,60 @@ static void test_shuffle_unshuffle()
}
-void test_cxx11_tensor_shuffling()
+template <int DataLayout>
+static void test_empty_shuffling()
+{
+ Tensor<float, 4, DataLayout> tensor(2,3,0,7);
+ tensor.setRandom();
+ array<ptrdiff_t, 4> shuffles;
+ shuffles[0] = 0;
+ shuffles[1] = 1;
+ shuffles[2] = 2;
+ shuffles[3] = 3;
+
+ Tensor<float, 4, DataLayout> no_shuffle;
+ no_shuffle = tensor.shuffle(shuffles);
+
+ VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
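+ // Dimension 2 is empty, so the element loops below never execute; the
+ // meaningful assertions are the shape checks.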
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 0; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+ }
+ }
+ }
+ }
+
+ shuffles[0] = 2;
+ shuffles[1] = 3;
+ shuffles[2] = 1;
+ shuffles[3] = 0;
+ Tensor<float, 4, DataLayout> shuffle;
+ shuffle = tensor.shuffle(shuffles);
+
+ VERIFY_IS_EQUAL(shuffle.dimension(0), 0);
+ VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+ VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+ VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 0; ++k) {
+ for (int l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+ }
+ }
+ }
+ }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling)
{
CALL_SUBTEST(test_simple_shuffling<ColMajor>());
CALL_SUBTEST(test_simple_shuffling<RowMajor>());
@@ -225,4 +278,6 @@ void test_cxx11_tensor_shuffling()
CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+ CALL_SUBTEST(test_empty_shuffling<ColMajor>());
+ CALL_SUBTEST(test_empty_shuffling<RowMajor>());
}
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
new file mode 100644
index 000000000..ca4e8b5ef
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) {
+ IndexType sizeDim1 = 2;
+ IndexType sizeDim2 = 3;
+ IndexType sizeDim3 = 5;
+ IndexType sizeDim4 = 7;
+ array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+ Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange);
+ tensor.setRandom();
+
+ const size_t buffSize = tensor.size() * sizeof(DataType);
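+ // An identity permutation (0, 1, 2, 3) must reproduce the input unchanged.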
+ array<IndexType, 4> shuffles;
+ shuffles[0] = 0;
+ shuffles[1] = 1;
+ shuffles[2] = 2;
+ shuffles[3] = 3;
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1,
+ tensorRange);
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2,
+ tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
+
+ gpu2.device(sycl_device) = gpu1.shuffle(shuffles);
+ sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
+ sycl_device.synchronize();
+
+ VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
+ VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
+ VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l));
+ }
+ }
+ }
+ }
+
+ shuffles[0] = 2;
+ shuffles[1] = 3;
+ shuffles[2] = 1;
+ shuffles[3] = 0;
+ array<IndexType, 4> tensorrangeShuffle = {
+ {sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
+ Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle);
+ DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
+ TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3(
+ gpu_data3, tensorrangeShuffle);
+
+ gpu3.device(sycl_device) = gpu1.shuffle(shuffles);
+ sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
+ sycl_device.synchronize();
+
+ VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3);
+ VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4);
+ VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
+ VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
+ for (IndexType l = 0; l < sizeDim4; ++l) {
+ VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i));
+ }
+ }
+ }
+ }
+}
+
+template <typename DataType, typename dev_Selector>
+void sycl_shuffling_test_per_device(dev_Selector s) {
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
+ for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp
index 5a0d339ef..6d70f5435 100644
--- a/unsupported/test/cxx11_tensor_simple.cpp
+++ b/unsupported/test/cxx11_tensor_simple.cpp
@@ -316,7 +316,7 @@ static void test_resize()
VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
}
-void test_cxx11_tensor_simple()
+EIGEN_DECLARE_TEST(cxx11_tensor_simple)
{
CALL_SUBTEST(test_0d());
CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp
index 935b908cc..aefdfa9b4 100644
--- a/unsupported/test/cxx11_tensor_striding.cpp
+++ b/unsupported/test/cxx11_tensor_striding.cpp
@@ -110,7 +110,7 @@ static void test_striding_as_lvalue()
}
-void test_cxx11_tensor_striding()
+EIGEN_DECLARE_TEST(cxx11_tensor_striding)
{
CALL_SUBTEST(test_simple_striding<ColMajor>());
CALL_SUBTEST(test_simple_striding<RowMajor>());
diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp
new file mode 100644
index 000000000..d3b1fa77c
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include <iostream>
+#include <chrono>
+#include <ctime>
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
+{
+
+ Eigen::array<IndexType, 4> tensor_dims = {{2, 3, 5, 7}};
+ Eigen::array<IndexType, 4> stride_dims = {{1, 1, 3, 3}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+ Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensor_dims);
+ Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);
+
+ std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+ std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+ std::size_t stride_bytes = stride.size() * sizeof(DataType);
+ DataType* d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+ DataType* d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+ DataType* d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+ tensor.setRandom();
+ array<IndexType, 4> strides;
+ strides[0] = 1;
+ strides[1] = 1;
+ strides[2] = 1;
+ strides[3] = 1;
+ sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+ gpu_no_stride.device(sycl_device) = gpu_tensor.stride(strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+ // Host equivalent: no_stride = tensor.stride(strides);
+
+ VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+ VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+ VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+ VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
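+ // Each strided output dimension is ceil(dim / stride), so strides (2,4,2,3)
+ // shrink the 2x3x5x7 input to 1x1x3x3.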
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
+ // Host equivalent: stride = tensor.stride(strides);
+ gpu_stride.device(sycl_device) = gpu_tensor.stride(strides);
+ sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+ VERIFY_IS_EQUAL(stride.dimension(0), 1);
+ VERIFY_IS_EQUAL(stride.dimension(1), 1);
+ VERIFY_IS_EQUAL(stride.dimension(2), 3);
+ VERIFY_IS_EQUAL(stride.dimension(3), 3);
+
+ for (IndexType i = 0; i < 1; ++i) {
+ for (IndexType j = 0; j < 1; ++j) {
+ for (IndexType k = 0; k < 3; ++k) {
+ for (IndexType l = 0; l < 3; ++l) {
+ VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+ }
+ }
+ }
+ }
+
+ sycl_device.deallocate(d_tensor);
+ sycl_device.deallocate(d_no_stride);
+ sycl_device.deallocate(d_stride);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
+{
+
+ Eigen::array<IndexType, 4> tensor_dims = {{2, 3, 5, 7}};
+ Eigen::array<IndexType, 4> stride_dims = {{3, 12, 10, 21}};
+
+ Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
+ Tensor<DataType, 4, DataLayout, IndexType> no_stride(stride_dims);
+ Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);
+
+ std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
+ std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
+ std::size_t stride_bytes = stride.size() * sizeof(DataType);
+
+ DataType* d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
+ DataType* d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
+ DataType* d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
+
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
+ Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
+
+ tensor.setRandom();
+ array<IndexType, 4> strides;
+ strides[0] = 2;
+ strides[1] = 4;
+ strides[2] = 2;
+ strides[3] = 3;
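+  // As an lvalue, stride() scatters the 2x3x5x7 input into the 3x12x10x21
+  // output, writing input element (i,j,k,l) to (2*i, 4*j, 2*k, 3*l).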
+
+  // Host equivalent: result.stride(strides) = tensor;
+ sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
+ gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
+ sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
+ }
+ }
+ }
+ }
+
+ array<IndexType, 4> no_strides;
+ no_strides[0] = 1;
+ no_strides[1] = 1;
+ no_strides[2] = 1;
+ no_strides[3] = 1;
+  // Host equivalent: result2.stride(strides) = tensor.stride(no_strides);
+
+ gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
+ sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
+
+ for (IndexType i = 0; i < 2; ++i) {
+ for (IndexType j = 0; j < 3; ++j) {
+ for (IndexType k = 0; k < 5; ++k) {
+ for (IndexType l = 0; l < 7; ++l) {
+ VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(d_tensor);
+ sycl_device.deallocate(d_no_stride);
+ sycl_device.deallocate(d_stride);
+}
+
+
+template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
+ QueueInterface queueInterface(s);
+ auto sycl_device=Eigen::SyclDevice(&queueInterface);
+ test_simple_striding<float, ColMajor, int64_t>(sycl_device);
+ test_simple_striding<float, RowMajor, int64_t>(sycl_device);
+ test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
+ test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(tensorStridingPerDevice(device));
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp
index 2f56eb495..2ca5c47db 100644
--- a/unsupported/test/cxx11_tensor_sugar.cpp
+++ b/unsupported/test/cxx11_tensor_sugar.cpp
@@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() {
}
}
-void test_cxx11_tensor_sugar()
+EIGEN_DECLARE_TEST(cxx11_tensor_sugar)
{
CALL_SUBTEST(test_comparison_sugar());
CALL_SUBTEST(test_scalar_sugar_add_mul());
diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp
index 6a9c33422..e6c5e2378 100644
--- a/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_sycl.cpp
@@ -15,8 +15,8 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -27,36 +27,188 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
+ IndexType sizeDim1 = 5;
+ IndexType sizeDim2 = 5;
+ IndexType sizeDim3 = 1;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
+ Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
+
+ in1 = in1.random();
+
+ DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+ DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
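+  // Both device buffers start from in1's data; scale each by a different
+  // constant in separate device-side kernels before copying back.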
+ gpu1.device(sycl_device) = gpu1 * 3.14f;
+ gpu2.device(sycl_device) = gpu2 * 2.7f;
+ sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
+ sycl_device.synchronize();
+
+ for (IndexType i = 0; i < in1.size(); ++i) {
+ VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
+ VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
+ VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
+ }
+
+ sycl_device.deallocate(gpu_data1);
+ sycl_device.deallocate(gpu_data2);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
+ IndexType size = 20;
+ array<IndexType, 1> tensorRange = {{size}};
+ Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
+ Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
+
+ in1 = in1.random();
+ in2 = in1;
+
+ DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+
+ TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
+ sycl_device.synchronize();
+ in1.setZero();
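+  // Zeroing the host tensor after the copy must not disturb the device copy;
+  // the data read back below should still match the original values kept in in2.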
+
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
+ sycl_device.synchronize();
+
+ for (IndexType i = 0; i < in1.size(); ++i) {
+ VERIFY_IS_APPROX(out(i), in2(i));
+ }
+
+ sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) {
+ using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+ IndexType full_size = 32;
+ IndexType half_size = full_size / 2;
+ array<IndexType, 1> tensorRange = {{full_size}};
+ tensor_type in1(tensorRange);
+ tensor_type out(tensorRange);
+
+ DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+ TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+ in1 = in1.random();
+ // Copy all data to device, then permute on copy back to host
+ sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));
+
+ for (IndexType i = 0; i < half_size; ++i) {
+ VERIFY_IS_APPROX(out(i), in1(i + half_size));
+ VERIFY_IS_APPROX(out(i + half_size), in1(i));
+ }
+
+ in1 = in1.random();
+ out.setZero();
+ // Permute copies to device, then copy all back to host
+ sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+ for (IndexType i = 0; i < half_size; ++i) {
+ VERIFY_IS_APPROX(out(i), in1(i + half_size));
+ VERIFY_IS_APPROX(out(i + half_size), in1(i));
+ }
+
+ in1 = in1.random();
+ out.setZero();
+ DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+ TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
+ // Copy all to device, permute copies on device, then copy all back to host
+ sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
+ sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
+ sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));
+
+ for (IndexType i = 0; i < half_size; ++i) {
+ VERIFY_IS_APPROX(out(i), in1(i + half_size));
+ VERIFY_IS_APPROX(out(i + half_size), in1(i));
+ }
+
+ sycl_device.deallocate(gpu_data_out);
+ sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) {
+ using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
+ IndexType full_size = 32;
+ IndexType half_size = full_size / 2;
+ array<IndexType, 1> tensorRange = {{full_size}};
+ tensor_type cpu_out(tensorRange);
+ tensor_type out(tensorRange);
+
+ cpu_out.setZero();
+
+ std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
+ std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));
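+  // memset fills bytes, so each element in the second half carries the byte
+  // pattern 0x01 repeated; the host and device results must then match exactly.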
+
+ DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
+ TensorMap<tensor_type> gpu1(gpu_data, tensorRange);
+
+ sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
+ sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));
+
+ for (IndexType i = 0; i < full_size; ++i) {
+ VERIFY_IS_APPROX(out(i), cpu_out(i));
+ }
+
+ sycl_device.deallocate(gpu_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
- int sizeDim1 = 100;
- int sizeDim2 = 100;
- int sizeDim3 = 100;
- array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Tensor<float, 3> in1(tensorRange);
- Tensor<float, 3> in2(tensorRange);
- Tensor<float, 3> in3(tensorRange);
- Tensor<float, 3> out(tensorRange);
+ IndexType sizeDim1 = 100;
+ IndexType sizeDim2 = 10;
+ IndexType sizeDim3 = 20;
+ array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> in3(tensorRange);
+  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
in2 = in2.random();
in3 = in3.random();
- float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
- float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
- float * gpu_in3_data = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
- float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+ DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
+ DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
+ DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
+ DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
- TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
- TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
- TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
- TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
+ TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
/// a=1.2f
gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
- sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
+ sycl_device.synchronize();
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
}
}
@@ -65,10 +217,12 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
/// a=b*1.2f
gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 1.2f);
}
@@ -77,12 +231,14 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
printf("a=b*1.2f Test Passed\n");
/// c=a*b
- sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in2(i,j,k));
@@ -93,10 +249,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
/// c=a+b
gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) +
in2(i,j,k));
@@ -107,10 +264,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
/// c=a*a
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in1(i,j,k));
@@ -121,10 +279,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
//a*3.14f + b*2.7f
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
- sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 3.14f
+ in2(i,j,k) * 2.7f);
@@ -134,12 +293,13 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
printf("a*3.14f + b*2.7f Test Passed\n");
///d= (a>0.5? b:c)
- sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
+ sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType));
gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
+ sycl_device.synchronize();
+ for (IndexType i = 0; i < sizeDim1; ++i) {
+ for (IndexType j = 0; j < sizeDim2; ++j) {
+ for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
? in2(i, j, k)
: in3(i, j, k));
@@ -152,8 +312,50 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_in3_data);
sycl_device.deallocate(gpu_out_data);
}
-void test_cxx11_tensor_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice sycl_device(s);
- CALL_SUBTEST(test_sycl_cpu(sycl_device));
+template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
+static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
+ IndexType size = 20;
+ array<IndexType, 1> tensorRange = {{size}};
+ Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
+ Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
+ Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
+
+ in = in.random();
+
+ Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
+ Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
+
+ TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
+ TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
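+  // Cast on the device, then compare against the same cast done on the host.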
+  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
+  out_host = in.template cast<Scalar2>();
+ for(IndexType i=0; i< size; i++)
+ {
+ VERIFY_IS_APPROX(out(i), out_host(i));
+ }
+ printf("cast Test Passed\n");
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
+ test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_sycl) {
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
+ }
}
diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp
index d680e9b3b..fed269a9a 100644
--- a/unsupported/test/cxx11_tensor_symmetry.cpp
+++ b/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -801,7 +801,7 @@ static void test_tensor_randacc()
}
}
-void test_cxx11_tensor_symmetry()
+EIGEN_DECLARE_TEST(cxx11_tensor_symmetry)
{
CALL_SUBTEST(test_symgroups_static());
CALL_SUBTEST(test_symgroups_dynamic());
diff --git a/unsupported/test/cxx11_tensor_thread_local.cpp b/unsupported/test/cxx11_tensor_thread_local.cpp
new file mode 100644
index 000000000..7e866f6d1
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_thread_local.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <iostream>
+#include <unordered_set>
+
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+struct Counter {
+ Counter() = default;
+
+ void inc() {
+    // Check that mutation happens only in the thread that created this counter.
+ VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by);
+ counter_value++;
+ }
+ int value() { return counter_value; }
+
+ std::thread::id created_by;
+ int counter_value = 0;
+};
+
+struct InitCounter {
+ void operator()(Counter& counter) {
+ counter.created_by = std::this_thread::get_id();
+ }
+};
+
+void test_simple_thread_local() {
+ int num_threads = internal::random<int>(4, 32);
+ Eigen::ThreadPool thread_pool(num_threads);
+ Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
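+  // ThreadLocal is sized for num_threads distinct threads: each worker thread
+  // lazily gets its own Counter, initialized by InitCounter on first access.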
+
+ int num_tasks = 3 * num_threads;
+ Eigen::Barrier barrier(num_tasks);
+
+ for (int i = 0; i < num_tasks; ++i) {
+ thread_pool.Schedule([&counter, &barrier]() {
+ Counter& local = counter.local();
+ local.inc();
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ barrier.Notify();
+ });
+ }
+
+ barrier.Wait();
+
+ counter.ForEach(
+ [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); });
+}
+
+void test_zero_sized_thread_local() {
+ Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter());
+
+ Counter& local = counter.local();
+ local.inc();
+
+ int total = 0;
+ counter.ForEach([&total](std::thread::id, Counter& cnt) {
+ total += cnt.value();
+ VERIFY_IS_EQUAL(cnt.value(), 1);
+ });
+
+ VERIFY_IS_EQUAL(total, 1);
+}
+
+// All thread-local values fit into the lock-free storage.
+void test_large_number_of_tasks_no_spill() {
+ int num_threads = internal::random<int>(4, 32);
+ Eigen::ThreadPool thread_pool(num_threads);
+ Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter());
+
+ int num_tasks = 10000;
+ Eigen::Barrier barrier(num_tasks);
+
+ for (int i = 0; i < num_tasks; ++i) {
+ thread_pool.Schedule([&counter, &barrier]() {
+ Counter& local = counter.local();
+ local.inc();
+ barrier.Notify();
+ });
+ }
+
+ barrier.Wait();
+
+ int total = 0;
+ std::unordered_set<std::thread::id> unique_threads;
+
+ counter.ForEach([&](std::thread::id id, Counter& cnt) {
+ total += cnt.value();
+ unique_threads.insert(id);
+ });
+
+ VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in the pool might be woken up to execute submitted tasks.
+  // Also, thread_pool.Schedule() might run the task on the current thread if
+  // the queue is full.
+  VERIFY(unique_threads.size() <= static_cast<size_t>(num_threads + 1));
+}
+
+// The lock-free thread-local storage is too small to fit all the unique
+// threads, so it spills into a map guarded by a mutex.
+void test_large_number_of_tasks_with_spill() {
+ int num_threads = internal::random<int>(4, 32);
+ Eigen::ThreadPool thread_pool(num_threads);
+ Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter());
+
+ int num_tasks = 10000;
+ Eigen::Barrier barrier(num_tasks);
+
+ for (int i = 0; i < num_tasks; ++i) {
+ thread_pool.Schedule([&counter, &barrier]() {
+ Counter& local = counter.local();
+ local.inc();
+ barrier.Notify();
+ });
+ }
+
+ barrier.Wait();
+
+ int total = 0;
+ std::unordered_set<std::thread::id> unique_threads;
+
+ counter.ForEach([&](std::thread::id id, Counter& cnt) {
+ total += cnt.value();
+ unique_threads.insert(id);
+ });
+
+ VERIFY_IS_EQUAL(total, num_tasks);
+  // Not all threads in the pool might be woken up to execute submitted tasks.
+  // Also, thread_pool.Schedule() might run the task on the current thread if
+  // the queue is full.
+  VERIFY(unique_threads.size() <= static_cast<size_t>(num_threads + 1));
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) {
+ CALL_SUBTEST(test_simple_thread_local());
+ CALL_SUBTEST(test_zero_sized_thread_local());
+ CALL_SUBTEST(test_large_number_of_tasks_no_spill());
+ CALL_SUBTEST(test_large_number_of_tasks_with_spill());
+}
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2ef665f30..b772a1d60 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -16,29 +16,72 @@
using Eigen::Tensor;
+class TestAllocator : public Allocator {
+ public:
+ ~TestAllocator() EIGEN_OVERRIDE {}
+ EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
+ const_cast<TestAllocator*>(this)->alloc_count_++;
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
+ const_cast<TestAllocator*>(this)->dealloc_count_++;
+ internal::aligned_free(buffer);
+ }
+
+ int alloc_count() const { return alloc_count_; }
+ int dealloc_count() const { return dealloc_count_; }
+
+ private:
+ int alloc_count_ = 0;
+ int dealloc_count_ = 0;
+};
void test_multithread_elementwise()
{
- Tensor<float, 3> in1(2,3,7);
- Tensor<float, 3> in2(2,3,7);
- Tensor<float, 3> out(2,3,7);
+ Tensor<float, 3> in1(200, 30, 70);
+ Tensor<float, 3> in2(200, 30, 70);
+ Tensor<double, 3> out(200, 30, 70);
in1.setRandom();
in2.setRandom();
Eigen::ThreadPool tp(internal::random<int>(3, 11));
Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
- out.device(thread_pool_device) = in1 + in2 * 3.14f;
+ out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 7; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+ for (int i = 0; i < 200; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
}
}
}
}
+void test_async_multithread_elementwise()
+{
+ Tensor<float, 3> in1(200, 30, 70);
+ Tensor<float, 3> in2(200, 30, 70);
+ Tensor<double, 3> out(200, 30, 70);
+
+ in1.setRandom();
+ in2.setRandom();
+
+ Eigen::ThreadPool tp(internal::random<int>(3, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+ Eigen::Barrier b(1);
+ out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
+ b.Wait();
+
+ for (int i = 0; i < 200; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+ }
+ }
+ }
+}
void test_multithread_compound_assignment()
{
@@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() {
}
}
+// Apply Sqrt to all output elements.
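+// An output kernel is invoked on each finalized block of the contraction
+// output, so the sqrt is fused into the contraction rather than a second pass.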
+struct SqrtOutputKernel {
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+ const TensorContractionParams&, Index, Index, Index num_rows,
+ Index num_cols) const {
+ for (int i = 0; i < num_rows; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+ }
+ }
+ }
+};
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+ Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+ Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+  // Put trash in t_result to verify that the contraction clears the output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 1500, 248);
+ MapXf m_right(t_right.data(), 248, 1400);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+template<int DataLayout>
+void test_async_multithread_contraction_agrees_with_singlethread()
+{
+ int contract_size = internal::random<int>(100, 500);
+
+ Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
+ contract_size,
+ internal::random<int>(10, 40));
+
+ Tensor<float, 4, DataLayout> right(
+ internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
+ internal::random<int>(1, 20));
+
+ left.setRandom();
+ right.setRandom();
+
+ // add constants to shift values away from 0 for more precision
+ left += left.constant(1.5f);
+ right += right.constant(1.5f);
+
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+ Eigen::ThreadPool tp(internal::random<int>(2, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
+
+ Tensor<float, 5, DataLayout> st_result;
+ st_result = left.contract(right, dims);
+
+ Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+
+ Eigen::Barrier barrier(1);
+ tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
+ left.contract(right, dims);
+ barrier.Wait();
+
+ VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+ for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // Skip the comparison when both values are very small: relative-error
+    // checks are unreliable due to numerical precision near zero.
+ if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+ VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+ }
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
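+// The output (2x10) is tiny compared to the contracted dimension (10000), so
+// the contraction is expected to be sharded across threads by the inner dim.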
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims);
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ Eigen::Barrier barrier(1);
+ t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+ t_left.contract(t_right, dims);
+ barrier.Wait();
+
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ Eigen::Barrier barrier(1);
+ t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+ t_left.contract(t_right, dims, SqrtOutputKernel());
+ barrier.Wait();
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
template<int DataLayout>
void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
}
template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
{
Tensor<float, 4, DataLayout> tensor(17,5,7,11);
tensor.setRandom();
const int num_threads = internal::random<int>(2, 11);
ThreadPool threads(num_threads);
- Eigen::ThreadPoolDevice device(&threads, num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@ void test_multithread_shuffle()
}
}
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+ const int num_threads = internal::random<int>(2, 11);
+ const int num_allocs = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+ for (int a = 0; a < num_allocs; ++a) {
+ void* ptr = device.allocate(512);
+ device.deallocate(ptr);
+ }
+ VERIFY(allocator != NULL);
+ VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+ VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
CALL_SUBTEST_1(test_multithread_elementwise());
+ CALL_SUBTEST_1(test_async_multithread_elementwise());
CALL_SUBTEST_1(test_multithread_compound_assignment());
CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+ CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+ CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+ // Test EvalShardedByInnerDimContext parallelization strategy.
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
// Exercise various cases that have been problematic in the past.
- CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
- CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+ CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+ CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+ CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+ CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+ CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+ CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
- CALL_SUBTEST_4(test_full_contraction<ColMajor>());
- CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+ CALL_SUBTEST_10(test_memcpy());
+ CALL_SUBTEST_10(test_multithread_random());
- CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
- CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+ TestAllocator test_allocator;
+ CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+ CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+ CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
- CALL_SUBTEST_6(test_memcpy());
- CALL_SUBTEST_6(test_multithread_random());
- CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
- CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
}
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 000000000..009722895
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+ Tensor<float, 0, DataLayout> tensor;
+ tensor.setRandom();
+ array<ptrdiff_t, 0> dims;
+ Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+ VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+ Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+ tensor1.setRandom();
+ Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+ VERIFY_IS_EQUAL(result1.rank(), 0);
+ float sum = 0.0f;
+ for (int i = 0; i < 5; ++i) {
+ sum += tensor1(i, i, i);
+ }
+ VERIFY_IS_EQUAL(result1(), sum);
+
+ Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+ tensor2.setRandom();
+ array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+ Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+ VERIFY_IS_EQUAL(result2.rank(), 0);
+ sum = 0.0f;
+ for (int i = 0; i < 7; ++i) {
+ sum += tensor2(i, i, i, i, i);
+ }
+ VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+ Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+ tensor1.setRandom();
+ array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
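+  // Tracing over dims 0 and 2 (both of extent 3) sums entries with equal
+  // indices along those dims, leaving a rank-1 result over dim 1 (extent 5).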
+ Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+ VERIFY_IS_EQUAL(result1.rank(), 1);
+ VERIFY_IS_EQUAL(result1.dimension(0), 5);
+ float sum = 0.0f;
+ for (int i = 0; i < 5; ++i) {
+ sum = 0.0f;
+ for (int j = 0; j < 3; ++j) {
+ sum += tensor1(j, i, j);
+ }
+ VERIFY_IS_EQUAL(result1(i), sum);
+ }
+
+ Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+ tensor2.setRandom();
+ array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+ Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+ VERIFY_IS_EQUAL(result2.rank(), 2);
+ VERIFY_IS_EQUAL(result2.dimension(0), 5);
+ VERIFY_IS_EQUAL(result2.dimension(1), 5);
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 7; ++k) {
+ sum += tensor2(i, j, k, k);
+ }
+ VERIFY_IS_EQUAL(result2(i, j), sum);
+ }
+ }
+
+ array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+ Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+ VERIFY_IS_EQUAL(result3.rank(), 2);
+ VERIFY_IS_EQUAL(result3.dimension(0), 7);
+ VERIFY_IS_EQUAL(result3.dimension(1), 7);
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 5; ++k) {
+ sum += tensor2(k, k, i, j);
+ }
+ VERIFY_IS_EQUAL(result3(i, j), sum);
+ }
+ }
+
+ Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+ tensor3.setRandom();
+ array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+ Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+ VERIFY_IS_EQUAL(result4.rank(), 2);
+ VERIFY_IS_EQUAL(result4.dimension(0), 7);
+ VERIFY_IS_EQUAL(result4.dimension(1), 7);
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 3; ++k) {
+ sum += tensor3(k, i, k, j, k);
+ }
+ VERIFY_IS_EQUAL(result4(i, j), sum);
+ }
+ }
+
+ Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+ tensor4.setRandom();
+ array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+ Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+ VERIFY_IS_EQUAL(result5.rank(), 3);
+ VERIFY_IS_EQUAL(result5.dimension(0), 3);
+ VERIFY_IS_EQUAL(result5.dimension(1), 4);
+ VERIFY_IS_EQUAL(result5.dimension(2), 5);
+ for (int i = 0; i < 3; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ sum = 0.0f;
+ for (int l = 0; l < 7; ++l) {
+ sum += tensor4(i, l, j, l, k);
+ }
+ VERIFY_IS_EQUAL(result5(i, j, k), sum);
+ }
+ }
+ }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+ Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+ tensor.setRandom();
+ array<ptrdiff_t, 2> dims = { { 1, 3 } };
+ Tensor<float, 2, DataLayout> result(2, 5);
+ result = result.constant(1.0f) - tensor.trace(dims);
+ VERIFY_IS_EQUAL(result.rank(), 2);
+ VERIFY_IS_EQUAL(result.dimension(0), 2);
+ VERIFY_IS_EQUAL(result.dimension(1), 5);
+ float sum = 0.0f;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ sum = 0.0f;
+ for (int k = 0; k < 3; ++k) {
+ sum += tensor(i, k, j, k);
+ }
+ VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+ }
+ }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+ CALL_SUBTEST(test_0D_trace<ColMajor>());
+ CALL_SUBTEST(test_0D_trace<RowMajor>());
+ CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+ CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+ CALL_SUBTEST(test_simple_trace<ColMajor>());
+ CALL_SUBTEST(test_simple_trace<RowMajor>());
+ CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+ CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..46fceaa19 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -12,7 +12,7 @@
#include <Eigen/CXX11/Tensor>
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
#define EIGEN_NO_INT128
#else
typedef __uint128_t uint128_t;
@@ -144,7 +144,7 @@ void test_misc2() {
#endif
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
{
#ifdef EIGEN_NO_INT128
// Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..862212e82 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@ static void test_entire_volume_patch()
const int dy = patch_y - 1;
const int dx = patch_x - 1;
- const int forward_pad_z = dz - dz / 2;
- const int forward_pad_y = dy - dy / 2;
- const int forward_pad_x = dx - dx / 2;
+ const int forward_pad_z = dz / 2;
+ const int forward_pad_y = dy / 2;
+ const int forward_pad_x = dx / 2;
for (int pz = 0; pz < patch_z; pz++) {
for (int py = 0; py < patch_y; py++) {
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
}
}
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
{
CALL_SUBTEST(test_single_voxel_patch());
CALL_SUBTEST(test_entire_volume_patch());
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 000000000..8d99a48ed
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim0 = 4;
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+
+ // single volume patch: ColMajor
+ array<IndexType, 6> patchColMajorTensorRange={{sizeDim0,1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+ Tensor<DataType, 6, DataLayout,IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =single_voxel_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_voxel_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+ gpu_single_voxel_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(1, 1, 1);
+ sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+ VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+ array<IndexType, 6> patchRowMajorTensorRange={{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+ Tensor<DataType, 6, RowMajor,IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize =single_voxel_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_single_voxel_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, RowMajor,IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+ gpu_single_voxel_patch_row_major.device(sycl_device)=gpu_row_major.extract_volume_patches(1, 1, 1);
+ sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+ VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+ for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+ VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+ }
+
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+ sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+ const int depth = 4;
+ const int patch_z = 2;
+ const int patch_y = 3;
+ const int patch_x = 5;
+ const int batch = 7;
+
+ array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+ array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+ tensor_col_major.setRandom();
+
+ DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+ DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+ TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+ TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType));
+ gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout();
+ sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  // entire volume patch: ColMajor
+ array<IndexType, 6> patchColMajorTensorRange={{depth,patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+ Tensor<DataType, 6, DataLayout,IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+ size_t patchTensorBuffSize =entire_volume_patch_col_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_volume_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, DataLayout,IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+ gpu_entire_volume_patch_col_major.device(sycl_device)=gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+ sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+  // Host equivalent: entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+  // Host equivalent: entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+ array<IndexType, 6> patchRowMajorTensorRange = {{batch, patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+ Tensor<DataType, 6, RowMajor, IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+ patchTensorBuffSize = entire_volume_patch_row_major.size()*sizeof(DataType);
+ DataType* gpu_data_entire_volume_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+ TensorMap<Tensor<DataType, 6, RowMajor, IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+ gpu_entire_volume_patch_row_major.device(sycl_device) = gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+ sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+ const int dz = patch_z - 1;
+ const int dy = patch_y - 1;
+ const int dx = patch_x - 1;
+
+ const int forward_pad_z = dz / 2;
+ const int forward_pad_y = dy / 2;
+ const int forward_pad_x = dx / 2;
+
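+ // SAME padding splits the patch_dim - 1 overlap into floor/ceil halves;
+ // with patch_z = 2, patch_y = 3 and patch_x = 5 the forward pads are
+ // 0, 1 and 2. Below, eff_* maps a within-patch offset back to a source
+ // coordinate, and out-of-range offsets read back the zero padding.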
+ for (int pz = 0; pz < patch_z; pz++) {
+ for (int py = 0; py < patch_y; py++) {
+ for (int px = 0; px < patch_x; px++) {
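+ // patchId linearizes the (pz, py, px) patch offset column-major:
+ // pz + patch_z * py + patch_z * patch_y * px, i.e. 0..29 for the
+ // 2x3x5 patch grid.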
+ const int patchId = pz + patch_z * (py + px * patch_y);
+ for (int z = 0; z < patch_z; z++) {
+ for (int y = 0; y < patch_y; y++) {
+ for (int x = 0; x < patch_x; x++) {
+ for (int b = 0; b < batch; b++) {
+ for (int d = 0; d < depth; d++) {
+ float expected = 0.0f;
+ float expected_row_major = 0.0f;
+ const int eff_z = z - forward_pad_z + pz;
+ const int eff_y = y - forward_pad_y + py;
+ const int eff_x = x - forward_pad_x + px;
+ if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+ eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+ expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+ expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+ }
+ VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+ VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ sycl_device.deallocate(gpu_data_col_major);
+ sycl_device.deallocate(gpu_data_row_major);
+ sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+ sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+
+
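+// Runs both volume-patch tests on a single SYCL device. Note that the
+// SyclDevice below only stores a pointer to the QueueInterface, so the
+// interface must stay alive for as long as the device is in use.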
+template <typename DataType, typename dev_Selector>
+void sycl_tensor_volume_patch_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+  test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+  test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
index 2b11807c8..5f63161b2 100644
--- a/unsupported/test/dgmres.cpp
+++ b/unsupported/test/dgmres.cpp
@@ -9,7 +9,7 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "../../test/sparse_solver.h"
-#include <Eigen/src/IterativeSolvers/DGMRES.h>
+#include <unsupported/Eigen/IterativeSolvers>
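+// Headers under Eigen/src/ are internal; the public module header above is
+// the supported way to pull in DGMRES.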
template<typename T> void test_dgmres_T()
{
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
//CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor) );
}
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
{
CALL_SUBTEST_1(test_dgmres_T<double>());
CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e86..14a909d3b 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -35,7 +35,7 @@ struct TestFunc1
int m_inputs, m_values;
TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
- TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+ TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
int inputs() const { return m_inputs; }
int values() const { return m_values; }
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
VERIFY_IS_APPROX(j, jref);
}
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
{
adtl::setNumDir(NUMBER_DIRECTIONS);
@@ -132,7 +132,7 @@ void test_forward_adolc()
}
{
- // simple instanciation tests
+ // simple instantiation tests
Matrix<adtl::adouble,2,1> x;
foo(x);
Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);
diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
index f2969116b..8d2254b5b 100644
--- a/unsupported/test/gmres.cpp
+++ b/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
//CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor) );
}
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
{
CALL_SUBTEST_1(test_gmres_T<double>());
CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/unsupported/test/idrs.cpp b/unsupported/test/idrs.cpp
new file mode 100644
index 000000000..f88c01632
--- /dev/null
+++ b/unsupported/test/idrs.cpp
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+ IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+ IDRS<SparseMatrix<T>, IncompleteLUT<T> > idrs_colmajor_ilut;
+
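+ // check_sparse_square_solving comes from the shared harness in
+ // ../../test/sparse_solver.h: it builds random square sparse systems and
+ // verifies the IDR(s) solutions with both preconditioners.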
+ CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag) );
+ CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut) );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+ CALL_SUBTEST_1(test_idrs_T<double>());
+ CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index e770049e5..b5b764c65 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,6 +9,7 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
#ifdef EIGEN_TEST_PART_1
#include "sparse.h"
@@ -83,7 +84,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
}
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
{
// DM = dense matrix; SM = sparse matrix
@@ -95,7 +96,7 @@ void test_kronecker_product()
SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
SM_a.insert(1,1) = DM_a.coeffRef(1,1) = 0.6469156566545853;
SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
-
+
MatrixXd DM_b(3,2);
SparseMatrix<double> SM_b(3,2);
SM_b.insert(0,0) = DM_b.coeffRef(0,0) = 0.9004440976767099;
@@ -165,7 +166,7 @@ void test_kronecker_product()
SM_a.insert(0,3) = -0.2;
SM_a.insert(2,4) = 0.3;
SM_a.finalize();
-
+
SM_b.insert(0,0) = 0.4;
SM_b.insert(2,1) = -0.5;
SM_b.finalize();
@@ -183,7 +184,7 @@ void test_kronecker_product()
DM_b2.resize(4,8);
DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
-
+
for(int i = 0; i < g_repeat; i++)
{
double density = Eigen::internal::random<double>(0.01,0.5);
@@ -196,35 +197,35 @@ void test_kronecker_product()
MatrixXf dA(ra,ca), dB(rb,cb), dC;
initSparse(density, dA, sA);
initSparse(density, dB, sB);
-
+
sC = kroneckerProduct(sA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC),dC);
-
+
sC = kroneckerProduct(sA.transpose(),sB);
dC = kroneckerProduct(dA.transpose(),dB);
VERIFY_IS_APPROX(MatrixXf(sC),dC);
-
+
sC = kroneckerProduct(sA.transpose(),sB.transpose());
dC = kroneckerProduct(dA.transpose(),dB.transpose());
VERIFY_IS_APPROX(MatrixXf(sC),dC);
-
+
sC = kroneckerProduct(sA,sB.transpose());
dC = kroneckerProduct(dA,dB.transpose());
VERIFY_IS_APPROX(MatrixXf(sC),dC);
-
+
sC2 = kroneckerProduct(sA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-
+
sC2 = kroneckerProduct(dA,sB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-
+
sC2 = kroneckerProduct(sA,dB);
dC = kroneckerProduct(dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-
+
sC2 = kroneckerProduct(2*sA,sB);
dC = kroneckerProduct(2*dA,dB);
VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@@ -236,11 +237,10 @@ void test_kronecker_product()
#ifdef EIGEN_TEST_PART_2
// simply check that for a dense kronecker product, sparse module is not needed
-
#include "main.h"
#include <Eigen/KroneckerProduct>
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
{
MatrixXd a(2,2), b(3,3), c;
a.setRandom();
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 64f168c16..7f9a81cd3 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
}
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
{
// Tests using the examples provided by (c)minpack
CALL_SUBTEST(testLmder1());
diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
index 50dec083d..b032cbf1d 100644
--- a/unsupported/test/matrix_exponential.cpp
+++ b/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
}
}
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
{
CALL_SUBTEST_2(test2dRotation<double>(1e-13));
CALL_SUBTEST_1(test2dRotation<float>(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 7c9b68a3c..6d753737d 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -23,9 +23,8 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)
// Returns a matrix with eigenvalues clustered around 0, 1 and 2.
template<typename MatrixType>
-MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+MatrixType randomMatrixWithRealEivals(const Index size)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
MatrixType diag = MatrixType::Zero(size, size);
@@ -42,16 +41,15 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
struct randomMatrixWithImagEivals
{
// Returns a matrix with eigenvalues clustered around 0 and +/- i.
- static MatrixType run(const typename MatrixType::Index size);
+ static MatrixType run(const Index size);
};
// Partial specialization for real matrices
template<typename MatrixType>
struct randomMatrixWithImagEivals<MatrixType, 0>
{
- static MatrixType run(const typename MatrixType::Index size)
+ static MatrixType run(const Index size)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
MatrixType diag = MatrixType::Zero(size, size);
Index i = 0;
@@ -77,9 +75,8 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
template<typename MatrixType>
struct randomMatrixWithImagEivals<MatrixType, 1>
{
- static MatrixType run(const typename MatrixType::Index size)
+ static MatrixType run(const Index size)
{
- typedef typename MatrixType::Index Index;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
const Scalar imagUnit(0, 1);
@@ -171,7 +168,6 @@ void testMatrixType(const MatrixType& m)
{
// Matrices with clustered eigenvalue lead to different code paths
// in MatrixFunction.h and are thus useful for testing.
- typedef typename MatrixType::Index Index;
const Index size = m.rows();
for (int i = 0; i < g_repeat; i++) {
@@ -181,7 +177,40 @@ void testMatrixType(const MatrixType& m)
}
}
-void test_matrix_function()
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)
+{
+ // Test if passing Ref and Map objects is possible
+ // (Regression test for Bug #1796)
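+ // The point is compile-and-run coverage: sqrt(), exp(), log() and the
+ // trigonometric matrix functions below must accept Ref<> and Map<>
+ // arguments, which failed to compile before the fix.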
+ Index size = A.rows();
+ MatrixType X; X.setRandom(size, size);
+ MatrixType Y(size,size);
+ Ref< MatrixType> R(Y);
+ Ref<const MatrixType> Rc(X);
+ Map< MatrixType> M(Y.data(), size, size);
+ Map<const MatrixType> Mc(X.data(), size, size);
+
+ X = X*X; // make sure sqrt is possible
+ Y = X.sqrt();
+ R = Rc.sqrt();
+ M = Mc.sqrt();
+ Y = X.exp();
+ R = Rc.exp();
+ M = Mc.exp();
+ X = Y; // make sure log is possible
+ Y = X.log();
+ R = Rc.log();
+ M = Mc.log();
+
+ Y = X.cos() + Rc.cos() + Mc.cos();
+ Y = X.sin() + Rc.sin() + Mc.sin();
+
+ Y = X.cosh() + Rc.cosh() + Mc.cosh();
+ Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+EIGEN_DECLARE_TEST(matrix_function)
{
CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
@@ -190,4 +219,9 @@ void test_matrix_function()
CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+ CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));
+ CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+ CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+ CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
}
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 7ccfacfdf..dbaf9dbdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -19,7 +19,7 @@ void test2dRotation(const T& tol)
MatrixPower<Matrix<T,2,2> > Apow(A);
for (int i=0; i<=20; ++i) {
- angle = std::pow(T(10), (i-10) / T(5.));
+ angle = std::pow(T(10), T(i-10) / T(5.));
c = std::cos(angle);
s = std::sin(angle);
B << c, s, -s, c;
@@ -61,7 +61,7 @@ void test3dRotation(const T& tol)
for (int i=0; i<=20; ++i) {
v = Matrix<T,3,1>::Random();
v.normalize();
- angle = std::pow(T(10), (i-10) / T(5.));
+ angle = std::pow(T(10), T(i-10) / T(5.));
VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
}
}
@@ -150,55 +150,55 @@ typedef Matrix<double,3,3,RowMajor> Matrix3dRowMajor;
typedef Matrix<long double,3,3> Matrix3e;
typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
{
CALL_SUBTEST_2(test2dRotation<double>(1e-13));
- CALL_SUBTEST_1(test2dRotation<float>(2e-5)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+ CALL_SUBTEST_1(test2dRotation<float>(2e-5f)); // was 1e-5, relaxed for clang 2.8 / linux / x86-64
CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
- CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+ CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
CALL_SUBTEST_10(test3dRotation<double>(1e-13));
- CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+ CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
CALL_SUBTEST_2(testGeneral(Matrix2d(), 1e-13));
CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
CALL_SUBTEST_3(testGeneral(Matrix4cd(), 1e-13));
CALL_SUBTEST_4(testGeneral(MatrixXd(8,8), 2e-12));
- CALL_SUBTEST_1(testGeneral(Matrix2f(), 1e-4));
- CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4));
- CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4));
- CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3)); // see bug 614
+ CALL_SUBTEST_1(testGeneral(Matrix2f(), 1e-4f));
+ CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4f));
+ CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4f));
+ CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3f)); // see bug 614
CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L));
CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13));
- CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4));
+ CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4f));
CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L));
CALL_SUBTEST_2(testSingular(Matrix2d(), 1e-13));
CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
CALL_SUBTEST_3(testSingular(Matrix4cd(), 1e-13));
CALL_SUBTEST_4(testSingular(MatrixXd(8,8), 2e-12));
- CALL_SUBTEST_1(testSingular(Matrix2f(), 1e-4));
- CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4));
- CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4));
- CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3));
+ CALL_SUBTEST_1(testSingular(Matrix2f(), 1e-4f));
+ CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4f));
+ CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4f));
+ CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3f));
CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L));
CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13));
- CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4));
+ CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4f));
CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L));
CALL_SUBTEST_2(testLogThenExp(Matrix2d(), 1e-13));
CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
CALL_SUBTEST_3(testLogThenExp(Matrix4cd(), 1e-13));
CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8), 2e-12));
- CALL_SUBTEST_1(testLogThenExp(Matrix2f(), 1e-4));
- CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4));
- CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4));
- CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3));
+ CALL_SUBTEST_1(testLogThenExp(Matrix2f(), 1e-4f));
+ CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4f));
+ CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4f));
+ CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3f));
CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L));
CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13));
- CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4));
+ CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4f));
CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L));
}
diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
index ea541e1ea..034f29217 100644
--- a/unsupported/test/matrix_square_root.cpp
+++ b/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
VERIFY_IS_APPROX(sqrtA * sqrtA, A);
}
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
{
for (int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 8b300b78a..2eb40fef6 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
}
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
{
CALL_SUBTEST_1(test_minres_T<double>());
// CALL_SUBTEST_2(test_minres_T<std::complex<double> >());
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 8404f1ff8..000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*
- MPFR C++: Multi-precision floating point number class for C++.
- Based on MPFR library: http://mpfr.org
-
- Project homepage: http://www.holoborodko.com/pavel/mpfr
- Contact e-mail: pavel@holoborodko.com
-
- Copyright (c) 2008-2015 Pavel Holoborodko
-
- Contributors:
- Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
- Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
- Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
- Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
- Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
- Rodney James, Jorge Leitao.
-
- Licensing:
- (A) MPFR C++ is under GNU General Public License ("GPL").
-
- (B) Non-free licenses may also be purchased from the author, for users who
- do not want their programs protected by the GPL.
-
- The non-free licenses are for users that wish to use MPFR C++ in
- their products but are unwilling to release their software
- under the GPL (which would require them to release source code
- and allow free redistribution).
-
- Such users can purchase an unlimited-use license from the author.
- Contact us for more details.
-
- GNU General Public License ("GPL") copyright permissions statement:
- **************************************************************************
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
- // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
- // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 2
-#define MPREAL_VERSION_STRING "3.6.2"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__)
- #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux
-#elif defined(_MSC_VER) // Microsoft Visual C++
- #define IsInf(x) (!_finite(x))
-#else
- #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
- #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic). Borrowed from Eigen.
-#if (__has_feature(cxx_rvalue_references) || \
- defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1600))
-
- #define MPREAL_HAVE_MOVE_SUPPORT
-
- // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
- #define mpfr_is_initialized(x) (0 != (x)->_mpfr_d)
- #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
- (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1800))
-
- #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
- #define MPREAL_MSVC_DEBUGVIEW_CODE DebugView = toString();
- #define MPREAL_MSVC_DEBUGVIEW_DATA std::string DebugView;
-#else
- #define MPREAL_MSVC_DEBUGVIEW_CODE
- #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
- #include <cstdlib> // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1 // Triggers overflow exception during conversion to double if mpreal
- // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
- // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be compatible with new versions of MPFR
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
- #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
- #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
- mpfr_t mp;
-
-public:
-
- // Get default rounding mode & precision
- inline static mp_rnd_t get_default_rnd() { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); }
- inline static mp_prec_t get_default_prec() { return mpfr_get_default_prec(); }
-
- // Constructors && type conversions
- mpreal();
- mpreal(const mpreal& u);
- mpreal(const mpf_t u);
- mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
- // Construct mpreal from mpfr_t structure.
- // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
- mpreal(const mpfr_t u, bool shared = false);
-
- mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
- ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- mpreal& operator=(mpreal&& v);
- mpreal(mpreal&& u);
-#endif
-
- // Operations
- // =
- // +, -, *, /, ++, --, <<, >>
- // *=, +=, -=, /=,
- // <, >, ==, <=, >=
-
- // =
- mpreal& operator=(const mpreal& v);
- mpreal& operator=(const mpf_t v);
- mpreal& operator=(const mpz_t v);
- mpreal& operator=(const mpq_t v);
- mpreal& operator=(const long double v);
- mpreal& operator=(const double v);
- mpreal& operator=(const unsigned long int v);
- mpreal& operator=(const unsigned long long int v);
- mpreal& operator=(const long long int v);
- mpreal& operator=(const unsigned int v);
- mpreal& operator=(const long int v);
- mpreal& operator=(const int v);
- mpreal& operator=(const char* s);
- mpreal& operator=(const std::string& s);
- template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
- // +
- mpreal& operator+=(const mpreal& v);
- mpreal& operator+=(const mpf_t v);
- mpreal& operator+=(const mpz_t v);
- mpreal& operator+=(const mpq_t v);
- mpreal& operator+=(const long double u);
- mpreal& operator+=(const double u);
- mpreal& operator+=(const unsigned long int u);
- mpreal& operator+=(const unsigned int u);
- mpreal& operator+=(const long int u);
- mpreal& operator+=(const int u);
-
- mpreal& operator+=(const long long int u);
- mpreal& operator+=(const unsigned long long int u);
- mpreal& operator-=(const long long int u);
- mpreal& operator-=(const unsigned long long int u);
- mpreal& operator*=(const long long int u);
- mpreal& operator*=(const unsigned long long int u);
- mpreal& operator/=(const long long int u);
- mpreal& operator/=(const unsigned long long int u);
-
- const mpreal operator+() const;
- mpreal& operator++ ();
- const mpreal operator++ (int);
-
- // -
- mpreal& operator-=(const mpreal& v);
- mpreal& operator-=(const mpz_t v);
- mpreal& operator-=(const mpq_t v);
- mpreal& operator-=(const long double u);
- mpreal& operator-=(const double u);
- mpreal& operator-=(const unsigned long int u);
- mpreal& operator-=(const unsigned int u);
- mpreal& operator-=(const long int u);
- mpreal& operator-=(const int u);
- const mpreal operator-() const;
- friend const mpreal operator-(const unsigned long int b, const mpreal& a);
- friend const mpreal operator-(const unsigned int b, const mpreal& a);
- friend const mpreal operator-(const long int b, const mpreal& a);
- friend const mpreal operator-(const int b, const mpreal& a);
- friend const mpreal operator-(const double b, const mpreal& a);
- mpreal& operator-- ();
- const mpreal operator-- (int);
-
- // *
- mpreal& operator*=(const mpreal& v);
- mpreal& operator*=(const mpz_t v);
- mpreal& operator*=(const mpq_t v);
- mpreal& operator*=(const long double v);
- mpreal& operator*=(const double v);
- mpreal& operator*=(const unsigned long int v);
- mpreal& operator*=(const unsigned int v);
- mpreal& operator*=(const long int v);
- mpreal& operator*=(const int v);
-
- // /
- mpreal& operator/=(const mpreal& v);
- mpreal& operator/=(const mpz_t v);
- mpreal& operator/=(const mpq_t v);
- mpreal& operator/=(const long double v);
- mpreal& operator/=(const double v);
- mpreal& operator/=(const unsigned long int v);
- mpreal& operator/=(const unsigned int v);
- mpreal& operator/=(const long int v);
- mpreal& operator/=(const int v);
- friend const mpreal operator/(const unsigned long int b, const mpreal& a);
- friend const mpreal operator/(const unsigned int b, const mpreal& a);
- friend const mpreal operator/(const long int b, const mpreal& a);
- friend const mpreal operator/(const int b, const mpreal& a);
- friend const mpreal operator/(const double b, const mpreal& a);
-
- //<<= Fast Multiplication by 2^u
- mpreal& operator<<=(const unsigned long int u);
- mpreal& operator<<=(const unsigned int u);
- mpreal& operator<<=(const long int u);
- mpreal& operator<<=(const int u);
-
- //>>= Fast Division by 2^u
- mpreal& operator>>=(const unsigned long int u);
- mpreal& operator>>=(const unsigned int u);
- mpreal& operator>>=(const long int u);
- mpreal& operator>>=(const int u);
-
- // Type Conversion operators
- bool toBool ( ) const;
- long toLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long toULong (mp_rnd_t mode = GMP_RNDZ) const;
- long long toLLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long long toULLong (mp_rnd_t mode = GMP_RNDZ) const;
- float toFloat (mp_rnd_t mode = GMP_RNDN) const;
- double toDouble (mp_rnd_t mode = GMP_RNDN) const;
- long double toLDouble (mp_rnd_t mode = GMP_RNDN) const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
- explicit operator bool () const { return toBool(); }
- explicit operator int () const { return int(toLong()); }
- explicit operator long () const { return toLong(); }
- explicit operator long long () const { return toLLong(); }
- explicit operator unsigned () const { return unsigned(toULong()); }
- explicit operator unsigned long () const { return toULong(); }
- explicit operator unsigned long long () const { return toULLong(); }
- explicit operator float () const { return toFloat(); }
- explicit operator double () const { return toDouble(); }
- explicit operator long double () const { return toLDouble(); }
-#endif
-
- // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
- ::mpfr_ptr mpfr_ptr();
- ::mpfr_srcptr mpfr_ptr() const;
- ::mpfr_srcptr mpfr_srcptr() const;
-
- // Convert mpreal to string with n significant digits in base b
- // n = -1 -> convert with the maximum available digits
- std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- std::string toString(const std::string& format) const;
-#endif
-
- std::ostream& output(std::ostream& os) const;
-
- // Math Functions
- friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
- friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend int cmpabs(const mpreal& a,const mpreal& b);
-
- friend const mpreal log (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
- friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal acos (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asin (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
- friend const mpreal acot (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asec (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsc (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csch (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal coth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal eint (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal gamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tgamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lngamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lgamma (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
- friend const mpreal zeta (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erf (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erfc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
- friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
- friend int sgn(const mpreal& v); // returns -1 or +1
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- friend int sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal li2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal rec_sqrt (const mpreal& v, mp_rnd_t rnd_mode);
-
- // MATLAB's semantic equivalents
- friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
- friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ai (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal grandom (unsigned int seed);
-#endif
-
- // Uniformly distributed random number generation in [0,1] using
- // Mersenne-Twister algorithm by default.
- // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
- // Check urandom() for more precise control.
- friend const mpreal random(unsigned int seed);
-
- // Splits mpreal value into fractional and integer parts.
- // Returns fractional part and stores integer part in n.
- friend const mpreal modf(const mpreal& v, mpreal& n);
-
- // Constants
- // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
- friend const mpreal const_log2 (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_pi (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_euler (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode);
-
- // returns +inf iff sign>=0 otherwise -inf
- friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
- // Output/ Input
- friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
- friend std::istream& operator>>(std::istream& is, mpreal& v);
-
- // Integer Related Functions
- friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ceil (const mpreal& v);
- friend const mpreal floor(const mpreal& v);
- friend const mpreal round(const mpreal& v);
- friend const mpreal trunc(const mpreal& v);
- friend const mpreal rint_ceil (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal frac (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal remainder ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- // Miscellaneous Functions
- friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
- friend const mpreal nextabove (const mpreal& x);
- friend const mpreal nextbelow (const mpreal& x);
-
- // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
- friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
- // Instance Checkers
- friend bool (isnan) (const mpreal& v);
- friend bool (isinf) (const mpreal& v);
- friend bool (isfinite) (const mpreal& v);
-
- friend bool isnum (const mpreal& v);
- friend bool iszero (const mpreal& v);
- friend bool isint (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend bool isregular(const mpreal& v);
-#endif
-
- // Set/Get instance properties
- inline mp_prec_t get_prec() const;
- inline void set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode
-
- // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
- inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
- inline int getPrecision() const;
-
- // Set mpreal to +/- inf, NaN, +/-0
- mpreal& setInf (int Sign = +1);
- mpreal& setNan ();
- mpreal& setZero (int Sign = +1);
- mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
- //Exponent
- mp_exp_t get_exp();
- int set_exp(mp_exp_t e);
- int check_range (int t, mp_rnd_t rnd_mode = get_default_rnd());
- int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
- // Inexact conversion from float
- inline bool fits_in_bits(double x, int n);
-
- // Set/Get global properties
- static void set_default_prec(mp_prec_t prec);
- static void set_default_rnd(mp_rnd_t rnd_mode);
-
- static mp_exp_t get_emin (void);
- static mp_exp_t get_emax (void);
- static mp_exp_t get_emin_min (void);
- static mp_exp_t get_emin_max (void);
- static mp_exp_t get_emax_min (void);
- static mp_exp_t get_emax_max (void);
- static int set_emin (mp_exp_t exp);
- static int set_emax (mp_exp_t exp);
-
- // Efficient swapping of two mpreal values - needed for std algorithms
- friend void swap(mpreal& x, mpreal& y);
-
- friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
- // Human friendly Debug Preview in Visual Studio.
- // Put one of these lines:
- //
- // mpfr::mpreal=<DebugView> ; Show value only
- // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits ; Show value & precision
- //
- // at the beginning of
- // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
- MPREAL_MSVC_DEBUGVIEW_DATA
-
- // "Smart" resources deallocation. Checks if instance initialized before deletion.
- void clear(::mpfr_ptr);
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
- std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
- mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
- mpfr_set_zero_fast(mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
- mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
- mpfr_set (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
- mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds no pointer to actual data
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t u, bool shared)
-{
- if(shared)
- {
- std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
- }
- else
- {
- mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
- mpfr_set (mpfr_ptr(), u, mpreal::get_default_rnd());
- }
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpf_t u)
-{
- mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
- mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_z(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_q(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(), u, mode);
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ld(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_uj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_sj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s, base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- if(mpfr_is_initialized(x))
-#endif
- mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
- clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
- // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
- // This is needed for smooth integration with libraries based on expression templates, like Eigen.
- // TODO: Do the same for boolean operators.
- template <typename ArgumentType> struct result_type {};
-
- template <> struct result_type<mpreal> {typedef mpreal type;};
- template <> struct result_type<mpz_t> {typedef mpreal type;};
- template <> struct result_type<mpq_t> {typedef mpreal type;};
- template <> struct result_type<long double> {typedef mpreal type;};
- template <> struct result_type<double> {typedef mpreal type;};
- template <> struct result_type<unsigned long int> {typedef mpreal type;};
- template <> struct result_type<unsigned int> {typedef mpreal type;};
- template <> struct result_type<long int> {typedef mpreal type;};
- template <> struct result_type<int> {typedef mpreal type;};
- template <> struct result_type<long long> {typedef mpreal type;};
- template <> struct result_type<unsigned long long> {typedef mpreal type;};
-}
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
-// Gives max & min values for the required precision,
-// minval is 'safe' meaning 1 / minval does not overflow
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
-// 'Bitwise' equality check
-// maxUlps - a and b can be apart by maxUlps binary numbers.
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
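-
-// Usage sketch (illustrative, not part of the original header): comparing two
-// values that differ only by accumulated rounding error, using the helpers
-// declared above.
-//
-//     mpreal a = sqrt(mpreal(2));
-//     mpreal b = sqr(a) / a;                 // same value up to rounding
-//     bool close = isEqualUlps (a, b, 4);    // within 4 ulps
-//     bool fuzzy = isEqualFuzzy(a, b);       // default relative tolerance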
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int bits2digits(mp_prec_t b);
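-
-// Worked example (illustrative): for IEEE-double-like precision,
-//
-//     digits2bits(16) == mp_prec_t(ceil (16 * 3.3219...)) == 54
-//     bits2digits(53) == int      (floor(53 * 0.3010...)) == 15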
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
- if (this != &v)
- {
- mp_prec_t tp = mpfr_get_prec( mpfr_srcptr());
- mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
- if(tp != vp){
- clear(mpfr_ptr());
- mpfr_init2(mpfr_ptr(), vp);
- }
-
- mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
- return *this;
-}
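-
-// Note (illustrative): the copy assignment above re-initializes the target
-// whenever the precisions differ, so the left-hand side adopts the precision
-// of the right-hand side:
-//
-//     mpreal a(0, 64), b(0, 256);
-//     a = b;                       // a now carries 256 bits of precision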
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
- mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
- mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
- mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
- mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
- mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
- mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
-    // Use other converters for finer control over base, precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
-    // Use other converters for finer control over base, precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
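-
-// Usage sketch (illustrative): base-10 parsing at the target's precision;
-// on parse failure the current value is left unchanged.
-//
-//     mpreal x(0, 256);
-//     x = "3.14159265358979323846264338327950288";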
-
-template <typename real_t>
-inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
-{
- return *this = z.real();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// + Addition
-inline mpreal& mpreal::operator+=(const mpreal& v)
-{
- mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpf_t u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpz_t u)
-{
- mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpq_t u)
-{
- mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const long double u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const double u)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-#else
- *this += mpreal(u);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned long int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-
-inline const mpreal mpreal::operator+()const { return mpreal(*this); }
-
-inline const mpreal operator+(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
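-
-// Note (illustrative): the binary operators allocate the result at the wider
-// of the two operand precisions, so no precision is silently lost:
-//
-//     mpreal a(1, 128), b(1, 512);
-//     mpreal c = a + b;            // c has 512 bits of precision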
-
-inline mpreal& mpreal::operator++()
-{
- return *this += 1;
-}
-
-inline const mpreal mpreal::operator++ (int)
-{
- mpreal x(*this);
- *this += 1;
- return x;
-}
-
-inline mpreal& mpreal::operator--()
-{
- return *this -= 1;
-}
-
-inline const mpreal mpreal::operator-- (int)
-{
- mpreal x(*this);
- *this -= 1;
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// - Subtraction
-inline mpreal& mpreal::operator-=(const mpreal& v)
-{
- mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpz_t v)
-{
- mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpq_t v)
-{
- mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long double v)
-{
- *this -= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this -= mpreal(v);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned long int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal mpreal::operator-()const
-{
- mpreal u(*this);
- mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
- return u;
-}
-
-inline const mpreal operator-(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator-(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(b, mpfr_get_prec(a.mpfr_ptr()));
- x -= a;
- return x;
-#endif
-}
-
-inline const mpreal operator-(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// * Multiplication
-inline mpreal& mpreal::operator*= (const mpreal& v)
-{
- mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpz_t v)
-{
- mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpq_t v)
-{
- mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long double v)
-{
- *this *= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this *= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned long int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator*(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// / Division
-inline mpreal& mpreal::operator/=(const mpreal& v)
-{
- mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpz_t v)
-{
- mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpq_t v)
-{
- mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long double v)
-{
- *this /= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this /= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned long int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator/(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
- mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator/(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- x /= a;
- return x;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Shifts operators - Multiplication/Division by power of 2
-inline mpreal& mpreal::operator<<=(const unsigned long int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const unsigned int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const long int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned long int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const long int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned long int k)
-{
- return mul_2ui(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned int k)
-{
- return mul_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
- return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
- return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
- return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
- return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
- return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
- return div_2si(v,static_cast<long int>(k));
-}
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
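-
-// Illustrative note: the shift operators above scale by powers of two, so for
-// an mpreal x,
-//
-//     (x << 3) == mul_2ui(x, 3) == x * 8
-//     (x >> 1) == div_2ui(x, 1) == x / 2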
-
-//////////////////////////////////////////////////////////////////////////
-//Relational operators
-
-// WARNING:
-//
-// Please note that the following checks for a double NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) =  (b != b)
-// isnan(b) = !(b == b)   (used in the code below)
-//
-// Be cautious if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
-
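-// An illustrative consequence of the NaN handling above (assuming IEEE mode):
-//
-//     mpreal n; n.setNan();
-//     bool lt = (n < 1.0);    // false: ordered comparisons with NaN never hold
-//     bool ne = (n != n);     // true:  NaN compares unequal to itself
-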
-inline bool operator > (const mpreal& a, const mpreal& b ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator > (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); }
-
-inline bool operator >= (const mpreal& a, const mpreal& b ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-// inline bool operator >= (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); }
-
-inline bool operator < (const mpreal& a, const mpreal& b ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator < (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); }
-
-inline bool operator <= (const mpreal& a, const mpreal& b ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); }
-
-inline bool operator == (const mpreal& a, const mpreal& b ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); }
-
-inline bool operator != (const mpreal& a, const mpreal& b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long double b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const double b ){ return !(a == b); }
-
-inline bool (isnan) (const mpreal& op){ return (mpfr_nan_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isinf) (const mpreal& op){ return (mpfr_inf_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); }
-inline bool iszero (const mpreal& op){ return (mpfr_zero_p (op.mpfr_srcptr()) != 0 ); }
-inline bool isint (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool mpreal::toBool ( ) const { return mpfr_zero_p (mpfr_srcptr()) == 0; }
-inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); }
-inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); }
-inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); }
-inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); }
-inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); }
-inline long long mpreal::toLLong (mp_rnd_t mode) const { return mpfr_get_sj (mpfr_srcptr(), mode); }
-inline unsigned long long mpreal::toULLong (mp_rnd_t mode) const { return mpfr_get_uj (mpfr_srcptr(), mode); }
-
-inline ::mpfr_ptr mpreal::mpfr_ptr() { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_ptr() const { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_srcptr() const { return mp; }
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
- std::ostringstream oss;
- oss << f << t;
- return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
- char *s = NULL;
- std::string out;
-
- if( !format.empty() )
- {
- if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
- {
- out = std::string(s);
-
- mpfr_free_str(s);
- }
- }
-
- return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
-    // TODO: Add extended format specification (f, e, rounding mode) as is done in the output operator
- (void)b;
- (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
- std::ostringstream format;
-
- int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
- format << "%." << digits << "RNg";
-
- return toString(format.str());
-
-#else
-
- char *s, *ns = NULL;
- size_t slen, nslen;
- mp_exp_t exp;
- std::string out;
-
- if(mpfr_inf_p(mp))
- {
- if(mpfr_sgn(mp)>0) return "+Inf";
- else return "-Inf";
- }
-
- if(mpfr_zero_p(mp)) return "0";
- if(mpfr_nan_p(mp)) return "NaN";
-
- s = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
- ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
- if(s!=NULL && ns!=NULL)
- {
- slen = strlen(s);
- nslen = strlen(ns);
- if(nslen<=slen)
- {
- mpfr_free_str(s);
- s = ns;
- slen = nslen;
- }
- else {
- mpfr_free_str(ns);
- }
-
-        // Produce human-friendly formatting when possible
- if (exp>0 && static_cast<size_t>(exp)<slen)
- {
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp) ptr--;
-
- if(ptr==s+exp) out = std::string(s,exp+1);
- else out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
- //out = string(s,exp+1)+'.'+string(s+exp+1);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
- if(ptr==s+exp-1) out = std::string(s,exp);
- else out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
- //out = string(s,exp)+'.'+string(s+exp);
- }
-
-    }else{ // exp <= 0 || exp >= slen
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+1) ptr--;
-
- if(ptr==s+1) out = std::string(s,2);
- else out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
- //out = string(s,2)+'.'+string(s+2);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s) ptr--;
-
- if(ptr==s) out = std::string(s,1);
- else out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
- //out = string(s,1)+'.'+string(s+1);
- }
-
- // Make final string
- if(--exp)
- {
- if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
- else out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
- }
- }
-
- mpfr_free_str(s);
- return out;
- }else{
- return "conversion error!";
- }
-#endif
-}
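-
-// Usage sketch (illustrative): conversion with n significant decimal digits;
-// a negative n requests enough digits to represent the full precision.
-//
-//     mpreal x = const_pi(113);
-//     std::string s = x.toString(10);   // "3.141592654"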
-
-
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
- std::ostringstream format;
- const std::ios::fmtflags flags = os.flags();
-
- format << ((flags & std::ios::showpos) ? "%+" : "%");
- if (os.precision() >= 0)
- format << '.' << os.precision() << "R*"
- << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
- (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
- 'g');
- else
- format << "R*e";
-
- char *s = NULL;
- if(!(mpfr_asprintf(&s, format.str().c_str(),
- mpfr::mpreal::get_default_rnd(),
- mpfr_srcptr())
- < 0))
- {
- os << std::string(s);
- mpfr_free_str(s);
- }
- return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
- return v.output(os);
-}
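-
-// Usage sketch (illustrative): the output operator honours the usual iostream
-// flags via the mpfr_asprintf format built above (requires <iomanip>).
-//
-//     mpreal pi = const_pi(256);
-//     std::cout << std::setprecision(50) << pi << '\n';             // 50 digits
-//     std::cout << std::fixed << std::setprecision(10) << pi << '\n';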
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
-    // TODO: use std::hexfloat and other stream flags to set up the base
- std::string tmp;
- is >> tmp;
- mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
- return is;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Bits - decimal digits relation
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
- const double LOG2_10 = 3.3219280948873624;
-
- return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
- const double LOG10_2 = 0.30102999566398119;
-
- return int(std::floor( b * LOG10_2 ));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline int sgn(const mpreal& op)
-{
- return mpfr_sgn(op.mpfr_srcptr());
-}
-
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
- mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), (sign < 0 ? 1 : 0), RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
- return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
- mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
- mpfr_set_inf(mpfr_ptr(), sign);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
- mpfr_set_nan(mpfr_ptr());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- mpfr_set_zero(mpfr_ptr(), sign);
-#else
- mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
- setSign(sign);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
- return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
- mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp ()
-{
- return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
- int x = mpfr_set_exp(mpfr_ptr(), e);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
- *exp = mpfr_get_exp(y.mpfr_srcptr());
- mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
- return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
- mpreal x(v);
-
-    // rounding is not important since we are only adjusting the exponent (an exact operation)
- mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
- return ldexp(v, exp);
-}
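-
-// Round-trip sketch (illustrative): frexp splits x into m * 2^e with
-// m in [0.5, 1), and ldexp reassembles it exactly:
-//
-//     mp_exp_t e;
-//     mpreal x = 10;
-//     mpreal m = frexp(x, &e);     // m == 0.625, e == 4
-//     mpreal y = ldexp(m, e);      // y == x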
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
- /* the smallest eps such that 1 + eps != 1 */
- return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
- /* the smallest eps such that x + eps != x */
- if( x < 0)
- {
- return nextabove(-x) + x;
- }else{
- return nextabove( x) - x;
- }
-}
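-
-// Worked example (illustrative): at 53 bits of precision the result matches
-// the familiar IEEE double epsilon:
-//
-//     mpreal eps = machine_epsilon(53);        // == 2^-52 == DBL_EPSILON
-//     bool ok = (mpreal(1, 53) + eps != 1);    // true, by construction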
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
- /* min = 1/2 * 2^emin = 2^(emin - 1) */
- return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
- /* max = (1 - eps) * 2^emax, eps is machine epsilon */
- return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
- return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
- return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
- return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
- mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
- return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
- return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
- mpreal f(v);
-
-    // rounding is not important since frac and trunc of the same number are exact here
- mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
- mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
- return f;
-}
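-
-// Usage sketch (illustrative): splitting a value into fractional and integral
-// parts, both carrying the sign of the input:
-//
-//     mpreal n;
-//     mpreal f = modf(mpreal(-3.25), n);   // f == -0.25, n == -3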
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
- return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
- int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
- return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
- return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
- return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
- return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
- return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
- return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
- return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
- return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f) \
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); \
- mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r); \
- return y;
-
-inline const mpreal sqr (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqr ); }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt); }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
- mpreal y;
- mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
- return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
- return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
- mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
- return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
- return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
- return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt (const long double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-inline const mpreal sqrt (const double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-
-inline const mpreal cbrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt ); }
-inline const mpreal fabs (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal abs (const mpreal& x, mp_rnd_t r) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal log (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log ); }
-inline const mpreal log2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log2 ); }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log10); }
-inline const mpreal exp (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp ); }
-inline const mpreal exp2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 ); }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp10); }
-inline const mpreal cos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cos ); }
-inline const mpreal sin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sin ); }
-inline const mpreal tan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tan ); }
-inline const mpreal sec (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sec ); }
-inline const mpreal csc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csc ); }
-inline const mpreal cot (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cot ); }
-inline const mpreal acos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acos ); }
-inline const mpreal asin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asin ); }
-inline const mpreal atan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atan ); }
-
-inline const mpreal logb (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { return log2 (abs(x),r); }
-
-inline const mpreal acot (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atan (1/v, r); }
-inline const mpreal asec (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acos (1/v, r); }
-inline const mpreal acsc (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asin (1/v, r); }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atanh(1/v, r); }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acosh(1/v, r); }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asinh(1/v, r); }
-
-inline const mpreal cosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cosh ); }
-inline const mpreal sinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sinh ); }
-inline const mpreal tanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tanh ); }
-inline const mpreal sech (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sech ); }
-inline const mpreal csch (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csch ); }
-inline const mpreal coth (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(coth ); }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acosh); }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asinh); }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atanh); }
-
-inline const mpreal log1p (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log1p ); }
-inline const mpreal expm1 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(expm1 ); }
-inline const mpreal eint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(eint ); }
-inline const mpreal gamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal tgamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma); }
-inline const mpreal zeta (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(zeta ); }
-inline const mpreal erf (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erf ); }
-inline const mpreal erfc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erfc ); }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j0 ); }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j1 ); }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y0 ); }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y1 ); }
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(),
- mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(0, prec);
- mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
- return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- int tsignp;
-
- if(signp) mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode);
- else mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
- return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
-
- a.set_prec(p1>p2?p1:p2);
-
- mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpfr_srcptr *p = new mpfr_srcptr[n];
-
- for (unsigned long int i = 0; i < n; i++)
- p[i] = tab[i].mpfr_srcptr();
-
- mpreal x;
- status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
- delete [] p;
- return x;
-}
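-
-// Usage sketch (illustrative): correctly-rounded summation of an array;
-// 'status' receives the mpfr_sum ternary value (0 means exact).
-//
-//     mpreal tab[3] = { 1, 2, 3 };
-//     int status;
-//     mpreal s = sum(tab, 3, status);   // s == 6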
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    /* R = rem(X,Y): if Y != 0, returns X - n * Y where n = trunc(X/Y). */
- return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- (void)rnd_mode;
-
- /*
-
- m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
- The following are true by convention:
- - mod(x,0) is x
- - mod(x,x) is 0
- - mod(x,y) for x != y and y != 0 has the same sign as y.
-
- */
-
- if(iszero(y)) return x;
- if(x == y) return 0;
-
- mpreal m = x - floor(x / y) * y;
-
- m.setSign(sgn(y)); // make sure result has the same sign as Y
-
- return m;
-}
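-
-// Worked example (illustrative) of the conventions documented above:
-//
-//     mod(mpreal( 7), mpreal( 3));   // ==  1
-//     mod(mpreal(-7), mpreal( 3));   // ==  2  (sign follows y)
-//     mod(mpreal( 7), mpreal(-3));   // == -2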
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t yp, xp;
-
- yp = y.get_prec();
- xp = x.get_prec();
-
- a.set_prec(yp>xp?yp:xp);
-
- mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
- return x;
-}
-#endif // MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(digamma); }
-inline const mpreal ai (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(ai); }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_log2(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_pi(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_euler(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_catalan(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
- mpreal x(0, p);
- mpfr_set_inf(x.mpfr_ptr(), sign);
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
- mpreal x(v);
- mpfr_ceil(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
- mpreal x(v);
- mpfr_floor(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
- mpreal x(v);
- mpfr_round(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
- mpreal x(v);
- mpfr_trunc(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal rint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint ); }
-inline const mpreal rint_ceil (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil ); }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor); }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round); }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc); }
-inline const mpreal frac (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(frac ); }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void swap (mpreal& a, mpreal& b) { mpfr_swap(a.mp,b.mp); }
-inline const mpreal (max)(const mpreal& x, const mpreal& y){ return (x>y?x:y); }
-inline const mpreal (min)(const mpreal& x, const mpreal& y){ return (x<y?x:y); }
-
-inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
-{
- mpreal a(x);
- mpfr_nexttoward(a.mp,y.mp);
- return a;
-}
-
-inline const mpreal nextabove (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextabove(a.mp);
- return a;
-}
-
-inline const mpreal nextbelow (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextbelow(a.mp);
- return a;
-}
-
-inline const mpreal urandomb (gmp_randstate_t& state)
-{
- mpreal x;
- mpfr_urandomb(x.mpfr_ptr(),state);
- return x;
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
- return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
- mpreal x;
- mpfr_random2(x.mpfr_ptr(),size,exp);
- return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- initialization & first random number
-// a = random();     <- subsequent random numbers
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::urandom(state);
-#else
- if(seed != 0) std::srand(seed);
- return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
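-
-// Usage sketch (illustrative), following the seeding convention above:
-//
-//     mpreal a = random(42);   // seed the generator, get the first sample
-//     mpreal b = random();     // subsequent samples from the same state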
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
- return x;
-}
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
- mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
- mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
- int i;
- double t;
- return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- mpreal x(a);
- mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
- // we are allowed to extend namespace std with specializations only
- template <>
- inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
- {
- return mpfr::swap(x, y);
- }
-
- template<>
- class numeric_limits<mpfr::mpreal>
- {
- public:
- static const bool is_specialized = true;
- static const bool is_signed = true;
- static const bool is_integer = false;
- static const bool is_exact = false;
- static const int radix = 2;
-
- static const bool has_infinity = true;
- static const bool has_quiet_NaN = true;
- static const bool has_signaling_NaN = true;
-
- static const bool is_iec559 = true; // = IEEE 754
- static const bool is_bounded = true;
- static const bool is_modulo = false;
- static const bool traps = true;
- static const bool tinyness_before = true;
-
- static const float_denorm_style has_denorm = denorm_absent;
-
- inline static mpfr::mpreal (min) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::minval(precision); }
- inline static mpfr::mpreal (max) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(precision); }
- inline static mpfr::mpreal lowest (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(precision); }
-
- // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
- inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(precision); }
-
- // Returns smallest eps such that x + eps != x (relative machine epsilon)
- inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) { return mpfr::machine_epsilon(x); }
-
- inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision);
- else return mpfr::mpreal(1.0, precision);
- }
-
- inline static const mpfr::mpreal infinity() { return mpfr::const_infinity(); }
- inline static const mpfr::mpreal quiet_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal signaling_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal denorm_min() { return (min)(); }
-
- // Please note, exponent range is not fixed in MPFR
- static const int min_exponent = MPFR_EMIN_DEFAULT;
- static const int max_exponent = MPFR_EMAX_DEFAULT;
- MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
- MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
- // Following members should be constant according to standard, but they can be variable in MPFR
- // So we define them as functions here.
- //
- // This is preferable way for std::numeric_limits<mpfr::mpreal> specialization.
- // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
- // See below for compatible implementation.
- inline static float_round_style round_style()
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- switch (r)
- {
- case GMP_RNDN: return round_to_nearest;
- case GMP_RNDZ: return round_toward_zero;
- case GMP_RNDU: return round_toward_infinity;
- case GMP_RNDD: return round_toward_neg_infinity;
- default: return round_indeterminate;
- }
- }
-
- inline static int digits() { return int(mpfr::mpreal::get_default_prec()); }
- inline static int digits(const mpfr::mpreal& x) { return x.getPrecision(); }
-
- inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return mpfr::bits2digits(precision);
- }
-
- inline static int digits10(const mpfr::mpreal& x)
- {
- return mpfr::bits2digits(x.getPrecision());
- }
-
- inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return digits10(precision);
- }
-#else
- // Digits and round_style are NOT constants when it comes to mpreal.
- // If possible, please use functions digits() and round_style() defined above.
- //
- // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
- // Change them accordingly to your application.
- //
- // For example, if you use 256 bits of precision uniformly in your program, then:
- // digits = 256
- // digits10 = 77
- // max_digits10 = 78
- //
- // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
- static const std::float_round_style round_style = round_to_nearest;
- static const int digits = 53;
- static const int digits10 = 15;
- static const int max_digits10 = 16;
-#endif
- };
-
-}
-
-#endif /* __MPREAL_H__ */
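
For reference, the random() helper deleted above was documented to be seeded once and then called repeatedly. A minimal sketch of that usage, assuming an external <mpreal.h> that still provides mpfr::random and that 256-bit default precision is desired:

    #include <mpreal.h>   // external copy; no longer bundled with Eigen
    #include <iostream>

    int main() {
      mpfr::mpreal::set_default_prec(256);      // 256-bit mantissa
      mpfr::mpreal a = mpfr::random(42);        // seed != 0: initializes and draws
      mpfr::mpreal b = mpfr::random();          // later calls reuse the state
      std::cout << a << " " << b << std::endl;  // two uniform samples from [0,1)
      return 0;
    }
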
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 685e7ea45..10beb0714 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -1,3 +1,4 @@
+#include <mpreal.h> // Must be included before main.h.
#include "main.h"
#include <Eigen/MPRealSupport>
#include <Eigen/LU>
@@ -7,7 +8,7 @@
using namespace mpfr;
using namespace Eigen;
-void test_mpreal_support()
+EIGEN_DECLARE_TEST(mpreal_support)
{
// set precision to 256 bits (double has only 53 bits)
mpreal::set_default_prec(256);
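
The hunk above makes the test pull mpreal from the include path instead of the removed bundled copy. A minimal stand-alone sketch of the same pattern, assuming an external mpreal.h is installed (the MatrixXmp typedef and the tolerance are illustrative):

    #include <mpreal.h>             // must precede the Eigen support header
    #include <Eigen/MPRealSupport>
    #include <Eigen/LU>

    int main() {
      mpfr::mpreal::set_default_prec(256);  // double has only 53 bits
      typedef Eigen::Matrix<mpfr::mpreal, Eigen::Dynamic, Eigen::Dynamic> MatrixXmp;
      MatrixXmp A = MatrixXmp::Random(4, 4);
      MatrixXmp b = MatrixXmp::Random(4, 1);
      MatrixXmp x = A.lu().solve(b);        // LU solve in 256-bit arithmetic
      return (A * x - b).norm() < 1e-60 ? 0 : 1;
    }
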
diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp
index 706a816f7..1c4438134 100644
--- a/unsupported/test/openglsupport.cpp
+++ b/unsupported/test/openglsupport.cpp
@@ -9,15 +9,24 @@
#include <main.h>
#include <iostream>
+#include <string>
+
+#if defined(__APPLE_CC__)
+ // Prevent deprecation warnings caused by GLEW on MacOS.
+ #define GL_SILENCE_DEPRECATION 1
+#endif
#include <GL/glew.h>
#include <Eigen/OpenGLSupport>
-#include <GL/glut.h>
-using namespace Eigen;
-
-
+#if defined(__APPLE_CC__)
+ #include <GLUT/glut.h>
+#else
+ #include <GL/freeglut.h>
+#endif
+using namespace Eigen;
#define VERIFY_MATRIX(CODE,REF) { \
+ glMatrixMode(GL_MODELVIEW); \
glLoadIdentity(); \
CODE; \
Matrix<float,4,4,ColMajor> m; m.setZero(); \
@@ -40,7 +49,7 @@ using namespace Eigen;
} \
VERIFY_IS_APPROX(value, data); \
}
-
+
#define VERIFY_UNIFORMi(NAME,TYPE) { \
TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \
TYPE data; \
@@ -53,175 +62,324 @@ using namespace Eigen;
} \
VERIFY_IS_APPROX(value, data); \
}
-
-void printInfoLog(GLuint objectID)
+
+void printProgramInfoLog(GLuint objectID)
{
int infologLength, charsWritten;
GLchar *infoLog;
- glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &infologLength);
+ glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
if(infologLength > 0)
{
infoLog = new GLchar[infologLength];
glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog);
- if (charsWritten>0)
+ if (charsWritten > 0)
+ std::cerr << "Program info : \n" << infoLog << std::endl;
+ delete[] infoLog;
+ }
+}
+
+void printShaderInfoLog(GLuint objectID)
+{
+ int infologLength, charsWritten;
+ GLchar *infoLog;
+ glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength);
+ if(infologLength > 0)
+ {
+ infoLog = new GLchar[infologLength];
+ glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog);
+ if (charsWritten > 0)
std::cerr << "Shader info : \n" << infoLog << std::endl;
delete[] infoLog;
}
}
-GLint createShader(const char* vtx, const char* frg)
+GLint createProgram(const char* vtx, const char* frg, bool print_errors = true)
{
GLint prg_id = glCreateProgram();
GLint vtx_id = glCreateShader(GL_VERTEX_SHADER);
GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER);
GLint ok;
-
+
glShaderSource(vtx_id, 1, &vtx, 0);
glCompileShader(vtx_id);
- glGetShaderiv(vtx_id,GL_COMPILE_STATUS,&ok);
+ glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok);
if(!ok)
{
- std::cerr << "vtx compilation failed\n";
+ if (print_errors)
+ {
+ std::cerr << "vtx compilation failed\n";
+ std::cerr << "Source:\n" << vtx << "\n";
+ printShaderInfoLog(vtx_id);
+ }
+ glDeleteShader(vtx_id);
+ return GL_ZERO;
}
-
+
glShaderSource(frg_id, 1, &frg, 0);
glCompileShader(frg_id);
- glGetShaderiv(frg_id,GL_COMPILE_STATUS,&ok);
+ glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok);
if(!ok)
{
- std::cerr << "frg compilation failed\n";
+ if (print_errors)
+ {
+ std::cerr << "frg compilation failed.\n";
+ std::cerr << "Source:\n" << frg << "\n";
+ printShaderInfoLog(frg_id);
+ }
+ glDeleteShader(vtx_id);
+ glDeleteShader(frg_id);
+ return GL_ZERO;
}
-
+
glAttachShader(prg_id, vtx_id);
glAttachShader(prg_id, frg_id);
glLinkProgram(prg_id);
- glGetProgramiv(prg_id,GL_LINK_STATUS,&ok);
+
+ // Delete shaders once linked.
+ glDeleteShader(vtx_id);
+ glDeleteShader(frg_id);
+ glGetProgramiv(prg_id, GL_LINK_STATUS, &ok);
if(!ok)
{
- std::cerr << "linking failed\n";
+ if (print_errors)
+ {
+ std::cerr << "linking failed.\n";
+ printProgramInfoLog(prg_id);
+ }
+ glDeleteProgram(prg_id);
+ return GL_ZERO;
}
- printInfoLog(prg_id);
-
+
glUseProgram(prg_id);
return prg_id;
}
-void test_openglsupport()
+GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true)
{
- int argc = 0;
- glutInit(&argc, 0);
- glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
- glutInitWindowPosition (0,0);
- glutInitWindowSize(10, 10);
+ return createProgram(vtx.c_str(), frg.c_str(), print_errors);
+}
- if(glutCreateWindow("Eigen") <= 0)
+std::string getGlslVersionString(int gl_major_version, int gl_minor_version)
+{
+ switch (gl_major_version)
{
- std::cerr << "Error: Unable to create GLUT Window.\n";
- exit(1);
+ case 2:
+ switch (gl_minor_version)
+ {
+ case 0:
+ return "#version 110";
+ case 1:
+ return "#version 120";
+ }
+ break;
+ case 3:
+ switch (gl_minor_version)
+ {
+ case 0:
+ return "#version 130";
+ case 1:
+ return "#version 140";
+ case 2:
+ return "#version 150";
+ case 3:
+ return "#version 330";
+ }
+ break;
+ case 4:
+ switch (gl_minor_version)
+ {
+ case 0:
+ return "#version 400";
+ case 1:
+ return "#version 410";
+ case 2:
+ return "#version 420";
+ case 3:
+ return "#version 430";
+ case 4:
+ return "#version 440";
+ case 5:
+ return "#version 450";
+ case 6:
+ return "#version 460";
+ }
+ break;
}
-
- glewExperimental = GL_TRUE;
- if(glewInit() != GLEW_OK)
- {
- std::cerr << "Warning: Failed to initialize GLEW\n";
+ return "";
+}
+
+void find_and_replace(
+ std::string& str,
+ const std::string& find,
+ const std::string& replace)
+{
+ size_t loc = 0;
+ size_t flen = find.length();
+ size_t rlen = replace.length();
+ while ( (loc = str.find(find, loc)) != std::string::npos) {
+ str.replace(loc, flen, replace);
+ loc += rlen;
}
+}
- Vector3f v3f;
- Matrix3f rot;
- glBegin(GL_POINTS);
-
- glVertex(v3f);
- glVertex(2*v3f+v3f);
- glVertex(rot*v3f);
-
- glEnd();
-
- // 4x4 matrices
- Matrix4f mf44; mf44.setRandom();
- VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
- VERIFY_MATRIX(glMultMatrix(mf44), mf44);
- Matrix4d md44; md44.setRandom();
- VERIFY_MATRIX(glLoadMatrix(md44), md44);
- VERIFY_MATRIX(glMultMatrix(md44), md44);
-
- // Quaternion
- Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
- VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
-
- Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
- VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
-
- // 3D Transform
- Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
- VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
- VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
-
- Transform<float,3,Affine> af3(acf3);
- VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
- VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
-
- Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
- VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
- VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
-
- Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
- VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
- VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
-
- Transform<double,3,Affine> ad3(acd3);
- VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
- VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
-
- Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
- VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
- VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
-
- // translations (2D and 3D)
- {
- Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
- VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
- Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
- VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
-
- Vector3f vf3; vf3.setRandom();
- VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
- Vector3d vd3; vd3.setRandom();
- VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
-
- Translation<float,3> tf3; tf3.vector().setRandom();
- VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
-
- Translation<double,3> td3; td3.vector().setRandom();
- VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+// Finds and replaces a set of substrings in a string.
+std::string format(
+ const std::string& str,
+ const std::vector<std::string>& find,
+ const std::vector<std::string>& replace)
+{
+ std::string out = str;
+ for (std::size_t i=0; i<find.size(); ++i) {
+ find_and_replace(out, find[i], replace[i]);
}
-
- // scaling (2D and 3D)
+ return out;
+}
+
+// GLUT display function that runs test. Must be run within the display loop
+// in order to properly destroy resources.
+void openglsupport_test_loop()
+{
+ // Get context info.
+ const GLubyte* gl_version_string = glGetString(GL_VERSION);
+ std::cerr << "GL version: " << gl_version_string << std::endl;
+ std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;
+ // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+.
+  // Version string guaranteed to be <major>.<minor><vendor extension>.
+ GLint gl_major_version = gl_version_string[0] - '0';
+ GLint gl_minor_version = gl_version_string[2] - '0';
+ bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2);
+
+ // Fixed-function pipeline removed in OpenGL 3.2.
+ if (legacy_gl)
{
- Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
- VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
- Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
- VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
-
- Vector3f vf3; vf3.setRandom();
- VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
- Vector3d vd3; vd3.setRandom();
- VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
-
- UniformScaling<float> usf(internal::random<float>());
- VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
-
- UniformScaling<double> usd(internal::random<double>());
- VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+ // Draw a basic triangle.
+ Vector3f v3f;
+ Matrix3f rot;
+ glBegin(GL_POINTS);
+ {
+ glVertex(v3f);
+ glVertex(2*v3f+v3f);
+ glVertex(rot*v3f);
+ }
+ glEnd();
+
+ // 4x4 matrices
+ Matrix4f mf44; mf44.setRandom();
+ VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
+ VERIFY_MATRIX(glMultMatrix(mf44), mf44);
+ Matrix4d md44; md44.setRandom();
+ VERIFY_MATRIX(glLoadMatrix(md44), md44);
+ VERIFY_MATRIX(glMultMatrix(md44), md44);
+
+ // Quaternion
+ Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
+ VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
+
+ Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random()));
+ VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
+
+ // 3D Transform
+ Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
+ VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
+ VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
+
+ Transform<float,3,Affine> af3(acf3);
+ VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
+ VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
+
+ Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
+ VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
+ VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
+
+ Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
+ VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
+ VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
+
+ Transform<double,3,Affine> ad3(acd3);
+ VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
+ VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
+
+ Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
+ VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
+ VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
+
+ // translations (2D and 3D)
+ {
+ Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
+ VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
+ Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
+ VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
+
+ Vector3f vf3; vf3.setRandom();
+ VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
+ Vector3d vd3; vd3.setRandom();
+ VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
+
+ Translation<float,3> tf3; tf3.vector().setRandom();
+ VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
+
+ Translation<double,3> td3; td3.vector().setRandom();
+ VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+ }
+
+ // scaling (2D and 3D)
+ {
+ Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
+ VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
+ Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
+ VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
+
+ Vector3f vf3; vf3.setRandom();
+ VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
+ Vector3d vd3; vd3.setRandom();
+ VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
+
+ UniformScaling<float> usf(internal::random<float>());
+ VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
+
+ UniformScaling<double> usd(internal::random<double>());
+ VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+ }
+ } else {
+ std::cerr << "Warning: fixed-function pipeline was not tested.\n";
+ }
+
+ // Dynamic shader substitution variables.
+ // Modern shaders require a version string, and newer runtimes fail to
+ // compile old GLSL versions. Thus, we dynamically set the GLSL version
+ // string based on runtime. Also, pre OpenGL 3.0, the output gl_FragColor was
+ // built-in. This was deprecated in OpenGL 3.0, requiring us to explicitly
+ // define the output variable.
+ std::vector<std::string> glsl_vars;
+ glsl_vars.push_back("${GLSL_VERSION}");
+ glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}");
+ glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}");
+
+ std::vector<std::string> glsl_vals;
+ glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version));
+ if (gl_major_version >= 3) {
+ glsl_vals.push_back("out vec4 fragColor;");
+ glsl_vals.push_back("fragColor");
+ } else {
+ glsl_vals.push_back("");
+ glsl_vals.push_back("gl_FragColor");
}
-
+
// uniform
{
- const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n";
-
- if(GLEW_VERSION_2_0)
+ // vertex shader.
+ std::string vtx = format(
+ "${GLSL_VERSION}\n"
+ "void main(void) {\n"
+ " gl_Position = vec4(0,0,0,1);\n"
+ "}\n",
+ glsl_vars, glsl_vals);
+
+#ifdef GL_VERSION_2_0
+ if(GLEW_VERSION_2_0 && GL_VERSION_2_0)
{
- #ifdef GL_VERSION_2_0
- const char* frg = ""
+ std::string frg = format(
+ "${GLSL_VERSION}\n"
"uniform vec2 v2f;\n"
"uniform vec3 v3f;\n"
"uniform vec4 v4f;\n"
@@ -231,107 +389,212 @@ void test_openglsupport()
"uniform mat2 m2f;\n"
"uniform mat3 m3f;\n"
"uniform mat4 m4f;\n"
- "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n";
-
- GLint prg_id = createShader(vtx,frg);
-
- VERIFY_UNIFORM(fv,v2f, Vector2f);
- VERIFY_UNIFORM(fv,v3f, Vector3f);
- VERIFY_UNIFORM(fv,v4f, Vector4f);
+ "${FRAG_OUTPUT_DECLARATION}\n"
+ "void main(void) { \n"
+ " ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n"
+ "}\n",
+ glsl_vars, glsl_vals);
+
+ GLint prg_id = createProgram(vtx, frg);
+ VERIFY(prg_id > 0 && "Failed to create program.");
+ VERIFY_UNIFORM(fv, v2f, Vector2f);
+ VERIFY_UNIFORM(fv, v3f, Vector3f);
+ VERIFY_UNIFORM(fv, v4f, Vector4f);
VERIFY_UNIFORMi(v2i, Vector2i);
VERIFY_UNIFORMi(v3i, Vector3i);
VERIFY_UNIFORMi(v4i, Vector4i);
- VERIFY_UNIFORM(fv,m2f, Matrix2f);
- VERIFY_UNIFORM(fv,m3f, Matrix3f);
- VERIFY_UNIFORM(fv,m4f, Matrix4f);
- #endif
+ VERIFY_UNIFORM(fv, m2f, Matrix2f);
+ VERIFY_UNIFORM(fv, m3f, Matrix3f);
+ VERIFY_UNIFORM(fv, m4f, Matrix4f);
+ glDeleteProgram(prg_id);
}
else
- std::cerr << "Warning: opengl 2.0 was not tested\n";
-
- if(GLEW_VERSION_2_1)
+#endif
+ std::cerr << "Warning: opengl 2.0 was not tested.\n";
+
+#ifdef GL_VERSION_2_1
+ if(GLEW_VERSION_2_1 && GL_VERSION_2_1 &&
+ (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1)))
{
- #ifdef GL_VERSION_2_1
- const char* frg = "#version 120\n"
+ std::string frg = format(
+ "${GLSL_VERSION}\n"
"uniform mat2x3 m23f;\n"
"uniform mat3x2 m32f;\n"
"uniform mat2x4 m24f;\n"
"uniform mat4x2 m42f;\n"
"uniform mat3x4 m34f;\n"
"uniform mat4x3 m43f;\n"
- "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n";
-
- GLint prg_id = createShader(vtx,frg);
-
+ "${FRAG_OUTPUT_DECLARATION}\n"
+ "void main(void) {\n"
+ " ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n"
+ "}\n",
+ glsl_vars, glsl_vals);
+
+ GLint prg_id = createProgram(vtx, frg);
+ VERIFY(prg_id > 0 && "Failed to create program.");
typedef Matrix<float,2,3> Matrix23f;
typedef Matrix<float,3,2> Matrix32f;
typedef Matrix<float,2,4> Matrix24f;
typedef Matrix<float,4,2> Matrix42f;
typedef Matrix<float,3,4> Matrix34f;
typedef Matrix<float,4,3> Matrix43f;
-
- VERIFY_UNIFORM(fv,m23f, Matrix23f);
- VERIFY_UNIFORM(fv,m32f, Matrix32f);
- VERIFY_UNIFORM(fv,m24f, Matrix24f);
- VERIFY_UNIFORM(fv,m42f, Matrix42f);
- VERIFY_UNIFORM(fv,m34f, Matrix34f);
- VERIFY_UNIFORM(fv,m43f, Matrix43f);
- #endif
+
+ VERIFY_UNIFORM(fv, m23f, Matrix23f);
+ VERIFY_UNIFORM(fv, m32f, Matrix32f);
+ VERIFY_UNIFORM(fv, m24f, Matrix24f);
+ VERIFY_UNIFORM(fv, m42f, Matrix42f);
+ VERIFY_UNIFORM(fv, m34f, Matrix34f);
+ VERIFY_UNIFORM(fv, m43f, Matrix43f);
+ glDeleteProgram(prg_id);
}
else
- std::cerr << "Warning: opengl 2.1 was not tested\n";
-
- if(GLEW_VERSION_3_0)
+#endif
+ std::cerr << "Warning: opengl 2.1 was not tested.\n";
+
+#ifdef GL_VERSION_3_0
+ if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3)
{
- #ifdef GL_VERSION_3_0
- const char* frg = "#version 150\n"
+ std::string frg = format(
+ "${GLSL_VERSION}\n"
"uniform uvec2 v2ui;\n"
"uniform uvec3 v3ui;\n"
"uniform uvec4 v4ui;\n"
- "out vec4 data;\n"
- "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n";
-
- GLint prg_id = createShader(vtx,frg);
-
+ "${FRAG_OUTPUT_DECLARATION}\n"
+ "void main(void) {\n"
+ " ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n"
+ "}\n",
+ glsl_vars, glsl_vals);
+
+ GLint prg_id = createProgram(vtx, frg);
+ VERIFY(prg_id > 0 && "Failed to create program.");
typedef Matrix<unsigned int,2,1> Vector2ui;
typedef Matrix<unsigned int,3,1> Vector3ui;
typedef Matrix<unsigned int,4,1> Vector4ui;
-
+
VERIFY_UNIFORMi(v2ui, Vector2ui);
VERIFY_UNIFORMi(v3ui, Vector3ui);
VERIFY_UNIFORMi(v4ui, Vector4ui);
- #endif
+ glDeleteProgram(prg_id);
}
else
- std::cerr << "Warning: opengl 3.0 was not tested\n";
-
- #ifdef GLEW_ARB_gpu_shader_fp64
+#endif
+ std::cerr << "Warning: opengl 3.0 was not tested.\n";
+
+  // dvecn supported if >= 4.1 or ARB_gpu_shader_fp64
+ bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1);
+ bool has_fp64_extension = false;
+#ifdef GLEW_ARB_gpu_shader_fp64
if(GLEW_ARB_gpu_shader_fp64)
{
- #ifdef GL_ARB_gpu_shader_fp64
- const char* frg = "#version 150\n"
+ // Check that extension can actually be compiled.
+    if (!has_fp64_native)
+ {
+ std::string frg = format(
+ "${GLSL_VERSION}\n"
+ "#extension GL_ARB_gpu_shader_fp64 : enable\n"
+ "uniform dvec2 dv2;\n"
+ "${FRAG_OUTPUT_DECLARATION}\n"
+ "void main(void) {\n"
+ " ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n"
+ "}\n",
+ glsl_vars, glsl_vals);
+ GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false);
+ if (prg_id)
+ {
+ has_fp64_extension = true;
+ glDeleteProgram(prg_id);
+ }
+ }
+ }
+#endif
+
+ if( has_fp64_native || has_fp64_extension )
+ {
+ std::vector<std::string> glsl_vars_with_extension = glsl_vars;
+ glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}");
+ std::vector<std::string> glsl_vals_with_extension = glsl_vals;
+ if (has_fp64_extension)
+ {
+ glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable");
+ }
+ else
+ {
+ glsl_vals_with_extension.push_back("");
+ }
+
+ std::string frg = format(
+ "${GLSL_VERSION}\n"
+ "${GLSL_EXTENSIONS}\n"
"uniform dvec2 v2d;\n"
"uniform dvec3 v3d;\n"
"uniform dvec4 v4d;\n"
- "out vec4 data;\n"
- "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n";
-
- GLint prg_id = createShader(vtx,frg);
-
- typedef Vector2d Vector2d;
- typedef Vector3d Vector3d;
- typedef Vector4d Vector4d;
-
- VERIFY_UNIFORM(dv,v2d, Vector2d);
- VERIFY_UNIFORM(dv,v3d, Vector3d);
- VERIFY_UNIFORM(dv,v4d, Vector4d);
- #endif
+ "${FRAG_OUTPUT_DECLARATION}\n"
+ "void main(void) {\n"
+ " ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n"
+ "}\n",
+ glsl_vars_with_extension, glsl_vals_with_extension);
+
+ GLint prg_id = createProgram(vtx,frg);
+ VERIFY(prg_id > 0 && "Failed to create program.");
+ VERIFY_UNIFORM(dv, v2d, Vector2d);
+ VERIFY_UNIFORM(dv, v3d, Vector3d);
+ VERIFY_UNIFORM(dv, v4d, Vector4d);
+ glDeleteProgram(prg_id);
}
else
- std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
- #else
- std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
- #endif
+ std::cerr << "Warning: dvec (fp64) was not tested.\n";
}
-
+
+ // Exit loop - Leaving main loop is supported by freeglut, otherwise we
+ // are forced to exit.
+#ifdef FREEGLUT
+ glutLeaveMainLoop();
+ // Trigger another display loop iteration. Otherwise, it just hangs.
+ glutPostRedisplay();
+#else
+ exit(0);
+#endif
+}
+
+EIGEN_DECLARE_TEST(openglsupport)
+{
+ int argc = 0;
+ glutInit(&argc, 0);
+
+ GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH;
+
+#ifndef EIGEN_LEGACY_OPENGL
+ // Initialize 3.2+ OpenGL context.
+#if defined(__APPLE_CC__)
+ glut_display_mode |= GLUT_3_2_CORE_PROFILE;
+#elif defined(FREEGLUT)
+ glutInitContextVersion(3, 2);
+ glutInitContextFlags(GLUT_FORWARD_COMPATIBLE);
+ glutInitContextProfile(GLUT_CORE_PROFILE);
+#endif
+#endif
+
+ glutInitDisplayMode(glut_display_mode);
+ glutInitWindowPosition(0, 0);
+ glutInitWindowSize(10, 10);
+
+ int window = glutCreateWindow("Eigen");
+ if(window <= 0)
+ {
+ std::cerr << "Error: Unable to create GLUT Window.\n";
+ exit(1);
+ }
+
+ glewExperimental = GL_TRUE;
+ if(glewInit() != GLEW_OK)
+ {
+ std::cerr << "Warning: Failed to initialize GLEW.\n";
+ exit(1);
+ }
+
+  // Run the test inside the display callback; otherwise GLUT fails to clean
+  // up properly and leads to memory access errors on exit.
+ glutDisplayFunc(openglsupport_test_loop);
+ glutMainLoop();
+ glutDestroyWindow(window);
}
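
Requesting the 3.2 core context, as the rewritten test entry point does, follows the standard freeglut recipe. A minimal sketch assuming freeglut headers (window title arbitrary):

    #include <GL/freeglut.h>

    int main(int argc, char** argv) {
      glutInit(&argc, argv);
    #if defined(FREEGLUT)
      glutInitContextVersion(3, 2);                   // request OpenGL 3.2+
      glutInitContextFlags(GLUT_FORWARD_COMPATIBLE);  // drop deprecated entry points
      glutInitContextProfile(GLUT_CORE_PROFILE);      // no fixed-function pipeline
    #endif
      glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
      glutInitWindowSize(10, 10);
      return glutCreateWindow("context-probe") > 0 ? 0 : 1;
    }
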
diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp
index 0c87478dd..4ff9bda5a 100644
--- a/unsupported/test/polynomialsolver.cpp
+++ b/unsupported/test/polynomialsolver.cpp
@@ -26,15 +26,25 @@ struct increment_if_fixed_size
}
}
+template<typename PolynomialType>
+PolynomialType polyder(const PolynomialType& p)
+{
+ typedef typename PolynomialType::Scalar Scalar;
+ PolynomialType res(p.size());
+ for(Index i=1; i<p.size(); ++i)
+ res[i-1] = p[i]*Scalar(i);
+ res[p.size()-1] = 0.;
+ return res;
+}
template<int Deg, typename POLYNOMIAL, typename SOLVER>
bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
{
- typedef typename POLYNOMIAL::Index Index;
typedef typename POLYNOMIAL::Scalar Scalar;
+ typedef typename POLYNOMIAL::RealScalar RealScalar;
typedef typename SOLVER::RootsType RootsType;
- typedef Matrix<Scalar,Deg,1> EvalRootsType;
+ typedef Matrix<RealScalar,Deg,1> EvalRootsType;
const Index deg = pols.size()-1;
@@ -44,10 +54,17 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
psolve.compute( pols );
const RootsType& roots( psolve.roots() );
EvalRootsType evr( deg );
+ POLYNOMIAL pols_der = polyder(pols);
+ EvalRootsType der( deg );
for( int i=0; i<roots.size(); ++i ){
- evr[i] = std::abs( poly_eval( pols, roots[i] ) ); }
+ evr[i] = std::abs( poly_eval( pols, roots[i] ) );
+ der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) ));
+ }
- bool evalToZero = evr.isZero( test_precision<Scalar>() );
+  // We need to divide by the magnitude of the derivative because,
+  // when the derivative is large, a very small error in the value of
+  // the root yields a very large error in the polynomial evaluation.
+ bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() );
if( !evalToZero )
{
cerr << "WRONG root: " << endl;
@@ -57,7 +74,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
cerr << endl;
}
- std::vector<Scalar> rootModuli( roots.size() );
+ std::vector<RealScalar> rootModuli( roots.size() );
Map< EvalRootsType > aux( &rootModuli[0], roots.size() );
aux = roots.array().abs();
std::sort( rootModuli.begin(), rootModuli.end() );
@@ -83,7 +100,7 @@ void evalSolver( const POLYNOMIAL& pols )
{
typedef typename POLYNOMIAL::Scalar Scalar;
- typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
+ typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
PolynomialSolverType psolve;
aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve );
@@ -97,6 +114,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
{
using std::sqrt;
typedef typename POLYNOMIAL::Scalar Scalar;
+ typedef typename POLYNOMIAL::RealScalar RealScalar;
typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
@@ -107,15 +125,12 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
// 1) the roots found are correct
// 2) the roots have distinct moduli
- typedef typename POLYNOMIAL::Scalar Scalar;
- typedef typename REAL_ROOTS::Scalar Real;
-
//Test realRoots
- std::vector< Real > calc_realRoots;
- psolve.realRoots( calc_realRoots );
- VERIFY( calc_realRoots.size() == (size_t)real_roots.size() );
+ std::vector< RealScalar > calc_realRoots;
+ psolve.realRoots( calc_realRoots, test_precision<RealScalar>());
+ VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() );
- const Scalar psPrec = sqrt( test_precision<Scalar>() );
+ const RealScalar psPrec = sqrt( test_precision<RealScalar>() );
for( size_t i=0; i<calc_realRoots.size(); ++i )
{
@@ -138,7 +153,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
bool hasRealRoot;
//Test absGreatestRealRoot
- Real r = psolve.absGreatestRealRoot( hasRealRoot );
+ RealScalar r = psolve.absGreatestRealRoot( hasRealRoot );
VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
if( hasRealRoot ){
VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) ); }
@@ -167,9 +182,11 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
template<typename _Scalar, int _Deg>
void polynomialsolver(int deg)
{
- typedef internal::increment_if_fixed_size<_Deg> Dim;
+ typedef typename NumTraits<_Scalar>::Real RealScalar;
+ typedef internal::increment_if_fixed_size<_Deg> Dim;
typedef Matrix<_Scalar,Dim::ret,1> PolynomialType;
typedef Matrix<_Scalar,_Deg,1> EvalRootsType;
+ typedef Matrix<RealScalar,_Deg,1> RealRootsType;
cout << "Standard cases" << endl;
PolynomialType pols = PolynomialType::Random(deg+1);
@@ -182,19 +199,15 @@ void polynomialsolver(int deg)
evalSolver<_Deg,PolynomialType>( pols );
cout << "Test sugar" << endl;
- EvalRootsType realRoots = EvalRootsType::Random(deg);
+ RealRootsType realRoots = RealRootsType::Random(deg);
roots_to_monicPolynomial( realRoots, pols );
evalSolverSugarFunction<_Deg>(
pols,
- realRoots.template cast <
- std::complex<
- typename NumTraits<_Scalar>::Real
- >
- >(),
+ realRoots.template cast <std::complex<RealScalar> >().eval(),
realRoots );
}
-void test_polynomialsolver()
+EIGEN_DECLARE_TEST(polynomialsolver)
{
for(int i = 0; i < g_repeat; i++)
{
@@ -214,5 +227,6 @@ void test_polynomialsolver()
internal::random<int>(9,13)
)) );
CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );
+ CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) );
}
}
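
The derivative-weighted residual introduced above follows from the first-order expansion |p(r + delta)| ~= |p'(r)| * delta: a steep derivative inflates the raw residual even when the computed root r + delta is accurate. A small stand-alone sketch of the normalized check using Eigen's poly_eval (polynomial and tolerance illustrative; the derivative is written out by hand):

    #include <unsupported/Eigen/Polynomials>
    #include <algorithm>
    #include <cmath>

    int main() {
      // p(x) = (x - 1)(x - 2) = 2 - 3x + x^2, coefficients by increasing degree.
      Eigen::Vector3d p(2.0, -3.0, 1.0);
      Eigen::Vector2d dp(-3.0, 2.0);     // p'(x) = -3 + 2x
      double root = 1.0 + 1e-12;         // exact root plus a tiny perturbation
      double residual = std::abs(Eigen::poly_eval(p, root));
      double scale    = std::max(1.0, std::abs(Eigen::poly_eval(dp, root)));
      // residual/scale stays O(1e-12) no matter how steep p is at the root.
      return residual / scale < 1e-9 ? 0 : 1;
    }
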
diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp
index 5fc968402..8ff451996 100644
--- a/unsupported/test/polynomialutils.cpp
+++ b/unsupported/test/polynomialutils.cpp
@@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar()
internal::random<int>(18,26) )) );
}
-void test_polynomialutils()
+EIGEN_DECLARE_TEST(polynomialutils)
{
for(int i = 0; i < g_repeat; i++)
{
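
This rename is one instance of a pattern applied throughout the patch: the old specially named free function gives way to the EIGEN_DECLARE_TEST macro. A before/after sketch (the subtest call is illustrative, reusing CauchyBounds_scalar from this file):

    // Before: the harness looked for a function named test_<file>.
    //   void test_polynomialutils() { ... }

    // After: the macro declares the entry point the harness expects.
    EIGEN_DECLARE_TEST(polynomialutils)
    {
      for(int i = 0; i < g_repeat; i++) {
        CALL_SUBTEST( CauchyBounds_scalar<double>() );
      }
    }
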
diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index a010ceb93..602c2cb84 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp
@@ -8,10 +8,45 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-// import basic and product tests for deprectaed DynamicSparseMatrix
+// import basic and product tests for deprecated DynamicSparseMatrix
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+static long g_realloc_count = 0;
+#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++;
+
+static long g_dense_op_sparse_count = 0;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10;
+#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20;
+
+#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1
+#endif
+
#define EIGEN_NO_DEPRECATED_WARNING
-#include "sparse_basic.cpp"
+// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix)
+// has an extra copy-assignment.
+#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT
#include "sparse_product.cpp"
+
+#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled
+#include "sparse_basic.cpp"
+#endif
+
+#if EIGEN_HAS_CXX11
+
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#include <unordered_map>
+#define EIGEN_UNORDERED_MAP_SUPPORT
+
+#endif
+
+
#include <Eigen/SparseExtra>
template<typename SetterType,typename DenseType, typename Scalar, int Options>
@@ -104,10 +139,8 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
#ifdef EIGEN_UNORDERED_MAP_SUPPORT
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
#endif
- #ifdef _DENSE_HASH_MAP_H_
+ #ifdef EIGEN_GOOGLEHASH_SUPPORT
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
- #endif
- #ifdef _SPARSE_HASH_MAP_H_
VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
#endif
@@ -129,7 +162,32 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
}
-void test_sparse_extra()
+
+template<typename SparseMatrixType>
+void check_marketio()
+{
+ typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix;
+ Index rows = internal::random<Index>(1,100);
+ Index cols = internal::random<Index>(1,100);
+ SparseMatrixType m1, m2;
+ m1 = DenseMatrix::Random(rows, cols).sparseView();
+ saveMarket(m1, "sparse_extra.mtx");
+ loadMarket(m2, "sparse_extra.mtx");
+ VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
+}
+
+template<typename VectorType>
+void check_marketio_vector()
+{
+ Index size = internal::random<Index>(1,100);
+ VectorType v1, v2;
+ v1 = VectorType::Random(size);
+ saveMarketVector(v1, "vector_extra.mtx");
+ loadMarketVector(v2, "vector_extra.mtx");
+ VERIFY_IS_EQUAL(v1,v2);
+}
+
+EIGEN_DECLARE_TEST(sparse_extra)
{
for(int i = 0; i < g_repeat; i++) {
int s = Eigen::internal::random<int>(1,50);
@@ -143,5 +201,26 @@ void test_sparse_extra()
CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );
CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
+
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
+ CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
+
+
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) );
+ CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) );
+
+ TEST_SET_BUT_UNUSED_VARIABLE(s);
}
}
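
The new check_marketio helpers round-trip matrices through Matrix Market files. A stand-alone sketch of the same I/O, assuming the SparseExtra header is on the include path as in this test (file name arbitrary):

    #include <Eigen/Dense>
    #include <Eigen/Sparse>
    #include <Eigen/SparseExtra>  // saveMarket / loadMarket

    int main() {
      Eigen::SparseMatrix<double> m1(10, 10), m2;
      m1.insert(0, 0) = 1.0;
      m1.insert(3, 7) = -2.5;
      m1.makeCompressed();
      Eigen::saveMarket(m1, "roundtrip.mtx");  // write Matrix Market file
      Eigen::loadMarket(m2, "roundtrip.mtx");  // read it back
      return Eigen::MatrixXd(m1).isApprox(Eigen::MatrixXd(m2)) ? 0 : 1;
    }
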
diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 057fb3e92..589bb76e1 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp
@@ -7,9 +7,21 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <limits.h>
#include "main.h"
#include "../Eigen/SpecialFunctions"
+// Hack to allow "implicit" conversions from double to Scalar via comma-initialization.
+template<typename Derived>
+Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) {
+ return (dense << static_cast<typename Derived::Scalar>(v));
+}
+
+template<typename XprType>
+Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) {
+ return (ci, static_cast<typename XprType::Scalar>(v));
+}
+
template<typename X, typename Y>
void verify_component_wise(const X& x, const Y& y)
{
@@ -64,8 +76,8 @@ template<typename ArrayType> void array_special_functions()
// igamma(a, x) = gamma(a, x) / Gamma(a)
// where Gamma and gamma are considered the standard unnormalized
// upper and lower incomplete gamma functions, respectively.
- ArrayType a = m1.abs() + 2;
- ArrayType x = m2.abs() + 2;
+ ArrayType a = m1.abs() + Scalar(2);
+ ArrayType x = m2.abs() + Scalar(2);
ArrayType zero = ArrayType::Zero(rows, cols);
ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
ArrayType a_m1 = a - one;
@@ -74,6 +86,7 @@ template<typename ArrayType> void array_special_functions()
ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
// Gamma(a, 0) == Gamma(a)
VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
@@ -81,10 +94,23 @@ template<typename ArrayType> void array_special_functions()
VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
// Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
- VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+ VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp());
// gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
- VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+ VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp());
+ }
+ {
+ // Verify for large a and x that values are between 0 and 1.
+ ArrayType m1 = ArrayType::Random(rows,cols);
+ ArrayType m2 = ArrayType::Random(rows,cols);
+ int max_exponent = std::numeric_limits<Scalar>::max_exponent10;
+ ArrayType a = m1.abs() * Scalar(pow(10., max_exponent - 1));
+ ArrayType x = m2.abs() * Scalar(pow(10., max_exponent - 1));
+ for (int i = 0; i < a.size(); ++i) {
+ Scalar igam = numext::igamma(a(i), x(i));
+ VERIFY(0 <= igam);
+ VERIFY(igam <= 1);
+ }
}
{
@@ -93,27 +119,37 @@ template<typename ArrayType> void array_special_functions()
Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
// location i*6+j corresponds to a_s[i], x_s[j].
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
- {0.0, 0.6321205588285578, 0.7768698398515702,
- 0.9816843611112658, 9.999500016666262e-05, 1.0},
- {0.0, 0.4275932955291202, 0.608374823728911,
- 0.9539882943107686, 7.522076445089201e-07, 1.0},
- {0.0, 0.01898815687615381, 0.06564245437845008,
- 0.5665298796332909, 4.166333347221828e-18, 1.0},
- {0.0, 0.9999780593618628, 0.9999899967080838,
- 0.9999996219837988, 0.9991370418689945, 1.0},
- {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
- Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
- {1.0, 0.36787944117144233, 0.22313016014842982,
- 0.018315638888734182, 0.9999000049998333, 0.0},
- {1.0, 0.5724067044708798, 0.3916251762710878,
- 0.04601170568923136, 0.9999992477923555, 0.0},
- {1.0, 0.9810118431238462, 0.9343575456215499,
- 0.4334701203667089, 1.0, 0.0},
- {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
- 3.7801620118431334e-07, 0.0008629581310054535,
- 0.0},
- {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+ Scalar igamma_s[][6] = {
+ {Scalar(0.0), nan, nan, nan, nan, nan},
+ {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702),
+ Scalar(0.9816843611112658), Scalar(9.999500016666262e-05),
+ Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911),
+ Scalar(0.9539882943107686), Scalar(7.522076445089201e-07),
+ Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.01898815687615381),
+ Scalar(0.06564245437845008), Scalar(0.5665298796332909),
+ Scalar(4.166333347221828e-18), Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838),
+ Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)},
+ {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0),
+ Scalar(0.5042041932513908)}};
+ Scalar igammac_s[][6] = {
+ {nan, nan, nan, nan, nan, nan},
+ {Scalar(1.0), Scalar(0.36787944117144233),
+ Scalar(0.22313016014842982), Scalar(0.018315638888734182),
+ Scalar(0.9999000049998333), Scalar(0.0)},
+ {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878),
+ Scalar(0.04601170568923136), Scalar(0.9999992477923555),
+ Scalar(0.0)},
+ {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499),
+ Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)},
+ {Scalar(1.0), Scalar(2.1940638138146658e-05),
+ Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07),
+ Scalar(0.0008629581310054535), Scalar(0.0)},
+ {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0),
+ Scalar(0.49579580674813944)}};
+
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igamma_s[i][j])) {
@@ -133,12 +169,32 @@ template<typename ArrayType> void array_special_functions()
}
#endif // EIGEN_HAS_C99_MATH
+ // Check the ndtri function against scipy.special.ndtri
+ {
+ ArrayType x(7), res(7), ref(7);
+ x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01;
+ ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408;
+ CALL_SUBTEST( verify_component_wise(ref, ref); );
+ CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); );
+ CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); );
+
+ // ndtri(normal_cdf(x)) ~= x
+ CALL_SUBTEST(
+ ArrayType m1 = ArrayType::Random(32);
+ using std::sqrt;
+
+ ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf();
+ cdf_val = (cdf_val + Scalar(1)) / Scalar(2);
+ verify_component_wise(cdf_val.ndtri(), m1););
+
+ }
+
// Check the zeta function against scipy.special.zeta
{
- ArrayType x(7), q(7), res(7), ref(7);
- x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9;
- q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345;
- ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+ ArrayType x(10), q(10), res(10), ref(10);
+ x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4;
+ q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3;
+ ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf;
CALL_SUBTEST( verify_component_wise(ref, ref); );
CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
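
The ndtri check above rests on the identity Phi(x) = (1 + erf(x / sqrt(2))) / 2 between the standard normal CDF and the error function, with ndtri as the inverse of Phi. A quick sanity check of one tabulated pair (ndtri(0.9) = 1.2815515655446004 per the reference values) using only the C++ standard library:

    #include <cmath>
    #include <cassert>

    int main() {
      double x = 1.2815515655446004;  // tabulated ndtri(0.9)
      double cdf = (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
      assert(std::abs(cdf - 0.9) < 1e-12);  // Phi(ndtri(0.9)) recovers 0.9
      return 0;
    }
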
@@ -146,22 +202,21 @@ template<typename ArrayType> void array_special_functions()
// digamma
{
- ArrayType x(7), res(7), ref(7);
- x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
- ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+ ArrayType x(9), res(9), ref(9);
+ x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3;
+ ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan;
CALL_SUBTEST( verify_component_wise(ref, ref); );
CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); );
}
-
#if EIGEN_HAS_C99_MATH
{
- ArrayType n(11), x(11), res(11), ref(11);
- n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170;
- x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
- ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+ ArrayType n(16), x(16), res(16), ref(16);
+ n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170, -1, 0, 1, 2, 3;
+ x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64, -1, -2, -3, -4, -5;
+ ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf;
CALL_SUBTEST( verify_component_wise(ref, ref); );
if(sizeof(RealScalar)>=8) { // double
@@ -288,8 +343,8 @@ template<typename ArrayType> void array_special_functions()
ArrayType m3 = ArrayType::Random(32);
ArrayType one = ArrayType::Constant(32, Scalar(1.0));
const Scalar eps = std::numeric_limits<Scalar>::epsilon();
- ArrayType a = (m1 * 4.0).exp();
- ArrayType b = (m2 * 4.0).exp();
+ ArrayType a = (m1 * Scalar(4)).exp();
+ ArrayType b = (m2 * Scalar(4)).exp();
ArrayType x = m3.abs();
// betainc(a, 1, x) == x**a
@@ -335,11 +390,108 @@ template<typename ArrayType> void array_special_functions()
ArrayType test = betainc(a, b + one, x) + eps;
verify_component_wise(test, expected););
}
-#endif
+#endif // EIGEN_HAS_C99_MATH
+
+  /* Code to generate the data for the following two test cases.
+    import mpmath
+    import numpy as np
+
+    N = 5
+    np.random.seed(3)
+
+ a = np.logspace(-2, 3, 6)
+ a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N]))
+ x = np.random.gamma(a, 1.0)
+ x = np.maximum(x, np.finfo(np.float32).tiny)
+
+ def igamma(a, x):
+ return mpmath.gammainc(a, 0, x, regularized=True)
+
+ def igamma_der_a(a, x):
+ res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a)
+ return np.float64(res)
+
+ def gamma_sample_der_alpha(a, x):
+ igamma_x = igamma(a, x)
+ def igammainv_of_igamma(a_prime):
+ return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) -
+ igamma_x, x, solver='newton')
+ return np.float64(mpmath.diff(igammainv_of_igamma, a))
+
+ v_igamma_der_a = np.vectorize(igamma_der_a)(a, x)
+ v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
+ */
+
+#if EIGEN_HAS_C99_MATH
+ // Test igamma_der_a
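+  // igamma_der_a(a, x) is d/da of the regularized lower incomplete gamma
+  // function igamma(a, x); the reference values v come from the mpmath
+  // snippet above.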
+ {
+ ArrayType a(30);
+ ArrayType x(30);
+ ArrayType res(30);
+ ArrayType v(30);
+
+ a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+ 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+ 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+ 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+ 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+ 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+ 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514,
+ -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596,
+ -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323,
+ -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522,
+ -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294,
+ -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921,
+ -0.00794899153653, -0.00777303219211, -0.00796085782042,
+ -0.0125850719397, -0.00455500206958, -0.00476436993148;
+
+ CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v););
+ }
+
+ // Test gamma_sample_der_alpha
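+  // gamma_sample_der_alpha(alpha, sample) is the derivative of a
+  // Gamma(alpha, 1) sample with respect to alpha, obtained by implicitly
+  // differentiating igamma (see the snippet above).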
+ {
+ ArrayType alpha(30);
+ ArrayType sample(30);
+ ArrayType res(30);
+ ArrayType v(30);
+
+ alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
+ 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;
+
+ sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
+ 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16,
+ 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06,
+ 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426,
+ 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354,
+ 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371,
+ 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754,
+ 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;
+
+ v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
+ 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
+ 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
+ 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
+ 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
+ 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
+ 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
+ 1.00106492525, 0.97734200649, 1.02198794179;
+
+ CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
+ verify_component_wise(res, v););
+ }
+#endif // EIGEN_HAS_C99_MATH
}
-void test_special_functions()
+EIGEN_DECLARE_TEST(special_functions)
{
CALL_SUBTEST_1(array_special_functions<ArrayXf>());
CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+ // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above.
+ // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>());
+ // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>());
}
diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
new file mode 100644
index 000000000..31233f1b0
--- /dev/null
+++ b/unsupported/test/special_packetmath.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <limits>
+#include "packetmath_test_shared.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename Scalar,typename Packet> void packetmath_real()
+{
+ using std::abs;
+ typedef internal::packet_traits<Scalar> PacketTraits;
+ const int PacketSize = internal::unpacket_traits<Packet>::size;
+
+ const int size = PacketSize*4;
+ EIGEN_ALIGN_MAX Scalar data1[PacketSize*4];
+ EIGEN_ALIGN_MAX Scalar data2[PacketSize*4];
+ EIGEN_ALIGN_MAX Scalar ref[PacketSize*4];
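+
+  // Each block below fills data1/data2 with inputs from the function's valid
+  // range and compares the scalar and packet implementations element-wise.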
+
+#if EIGEN_HAS_C99_MATH
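+  // NaN inputs must propagate through the packet paths for lgamma, erf
+  // and erfc.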
+ {
+ data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+ test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h;
+ h.store(data2, internal::plgamma(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ }
+ if (internal::packet_traits<Scalar>::HasErf) {
+ data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+ test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h;
+ h.store(data2, internal::perf(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ }
+ {
+ data1[0] = std::numeric_limits<Scalar>::quiet_NaN();
+ test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h;
+ h.store(data2, internal::perfc(h.load(data1)));
+ VERIFY((numext::isnan)(data2[0]));
+ }
+ {
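+    // ndtri is the inverse of the standard normal CDF; it is only defined
+    // for inputs in the open interval (0, 1).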
+ for (int i=0; i<size; ++i) {
+ data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1));
+ }
+ CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri);
+ }
+#endif // EIGEN_HAS_C99_MATH
+
+  // For bessel_i*e and bessel_j*, the valid range is the whole real line.
+ {
+ const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6);
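+    // Cap the exponent so that 10^exponent remains representable in Scalar.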
+ for (int i=0; i<size; ++i)
+ {
+ data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+ data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent))));
+ }
+
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1);
+ }
+
+  // Use a smaller data range for the bessel_i* functions, as these can become
+  // very large. Following #1693, we also restrict this range further to avoid
+  // infs due to differences between pexp and exp.
+ for (int i=0; i<size; ++i) {
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ }
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1);
+
+  // y_i and k_i are only valid for x > 0.
+ {
+ const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5);
+ for (int i=0; i<size; ++i)
+ {
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent))));
+ }
+ }
+
+ // TODO(srvasude): Re-enable this test once properly investigated why the
+ // scalar and vector paths differ.
+ // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e);
+
+  // Following #1693, we restrict the range for exp to avoid the results
+  // underflowing to zero too quickly.
+ for (int i=0; i<size; ++i) {
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ }
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0);
+ CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1);
+
+ for (int i=0; i<size; ++i) {
+ data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) *
+ Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2))));
+ }
+
+#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11)
+ CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
+ CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+ CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
+#endif
+
+}
+
+namespace Eigen {
+namespace test {
+
+template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger>
+struct runall {
+ static void run() {
+ packetmath_real<Scalar,PacketType>();
+ }
+};
+
+}
+}
+
+EIGEN_DECLARE_TEST(special_packetmath)
+{
+ g_first_pass = true;
+ for(int i = 0; i < g_repeat; i++) {
+
+ CALL_SUBTEST_1( test::runner<float>::run() );
+ CALL_SUBTEST_2( test::runner<double>::run() );
+ CALL_SUBTEST_3( test::runner<Eigen::half>::run() );
+ CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() );
+ g_first_pass = false;
+ }
+}
diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp
index 3be020434..88ec87b97 100644
--- a/unsupported/test/splines.cpp
+++ b/unsupported/test/splines.cpp
@@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d()
}
}
-void test_splines()
+EIGEN_DECLARE_TEST(splines)
{
for (int i = 0; i < g_repeat; ++i)
{